# Echo every chunk's source code in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
# Attach sjPlot (not referenced in the code visible here; kept in case a
# later section relies on it). The two calls must sit on separate lines:
# R cannot parse two statements on one line without a separator.
library(sjPlot)
The dataset default.csv indicates whether individuals defaulted on their credit debt. a. Split the dataset into a training set and a test set. Please see the answer below.
# Attach the packages used by the analysis (pacman installs any that are
# missing before loading them).
pacman::p_load(
  dplyr, tidyr, cowplot,
  tidyverse, viridis, GGally
)
# Read the credit-default data from the working directory.
default <- read_csv(file = "default.csv")
## Rows: 10000 Columns: 4
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): default, student
## dbl (2): balance, income
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column names of the raw data.
colnames(default)
## [1] "default" "student" "balance" "income"
# First six rows.
head(default, n = 6L)
# Dimensions: 10000 rows, 4 columns.
dim(default)
## [1] 10000 4
# overview of our data
# Pairwise overview of all four columns. ggpairs() has no `colour` argument:
# the original `colour = "income"` was ignored (see the warning in the
# output), so it is dropped here. To colour by a categorical column, pass an
# aesthetic mapping instead, e.g. mapping = ggplot2::aes(colour = student).
ggpairs(default, title = "correlogram with ggpairs()") +
  theme(panel.grid.major = element_blank())
## Warning in warn_if_args_exist(list(...)): Extra arguments: "colour" are being
## ignored. If these are meant to be aesthetics, submit them using the 'mapping'
## variable within ggpairs with ggplot2::aes or ggplot2::aes_string.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I learned what a factor is and how to use it.
Source: https://www.youtube.com/watch?v=xkRBfy8_2MU
# https://rdrr.io/cran/ISLR/man/Default.html
# A data frame with 10000 observations on the following 4 variables.
#
# default -> A factor with levels No and Yes indicating whether the customer defaulted on their debt
#
# student ->
# A factor with levels No and Yes indicating whether the customer is a student
#
# balance ->
# The average balance that the customer has remaining on their
# credit card after making their monthly payment
#
# income -> Income of customer
#
# Fix the RNG seed so the random split below is reproducible.
set.seed(12345)
# Last ten values of the response before recoding (all "No").
tail(default$default, n = 10)
## [1] "No" "No" "No" "No" "No" "No" "No" "No" "No" "No"
##################################################################
##         Recode the response: "Yes" = 1, otherwise 0          ##
##################################################################
default$default <- as.numeric(default$default == "Yes")
dim(default)
## [1] 10000 4
# Number of defaulters (rows coded 1): 333
sum(default$default)
## [1] 333
# Number of non-defaulters (rows coded 0): 9667
nrow(default) - sum(default$default)
## [1] 9667
## Use 80.15% of the rows for training. The seed was fixed earlier, so the
## partition is reproducible.
smp_size <- floor(0.8015 * nrow(default))
smp_size
## [1] 8015
## Draw the row indices of the training set without replacement.
train_res_ <- sample.int(nrow(default), size = smp_size)
tail(train_res_)
## [1] 9895 1848 2754 6956 3024 9256
# Training rows are the sampled indices ...
train <- default[train_res_, ]
dim(train) # 8015 rows, 4 columns
## [1] 8015 4
head(train)
# ... and the test rows are everything that was not sampled.
test <- default[-train_res_, ]
head(test)
dim(test) # 1985 rows, 4 columns
## [1] 1985 4
##################################################################
## Copy code from lecture ##
##################################################################
# Lecture-style split: draw one uniform number per row and label the row
# "train" when the draw is below 0.80, otherwise "test".
set.seed(12345)
index <- runif(n = nrow(default))
default$type <- ifelse(test = index < 0.80, yes = "train", no = "test")
###########################################################################
###########################################################################
### ###
### WHAT IS FACTOR AND WHEN WE SHOULD USE IT ###
### ###
###########################################################################
###########################################################################
# In R, factors are used to work with categorical variables,
# variables that have a fixed and known set of possible values.
# They are also useful when you want to display character vectors
# in a non-alphabetical order.
# source: https://r4ds.had.co.nz/factors.html#:~:text=In%20R%2C%20factors%20are%20used,to%20work%20with%20than%20characters.
# Store `student` as a factor so it is treated as a categorical variable.
default$student <- as.factor(default$student)
str(default$student) # shows the factor levels and their integer codes
## Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
# Split on the `type` label, then drop that helper column from both parts.
train <- select(filter(default, type == "train"), -type)
test <- select(filter(default, type == "test"), -type)
dim(train)
## [1] 8015 4
#################################################################
## Part 1 is done! ##
#################################################################
#################################################################
## add UW logo ##
#################################################################
# Attach the modelling and plotting packages used from here on.
pacman::p_load(
  dplyr, modelr, tidyr, cowplot,
  gapminder, tidyverse, viridis
)
# Path to the logo image that ships with the cowplot package.
logo_file <- system.file("extdata", "logo.png", package = "cowplot")
##################################################################
## Generalized linear model ##
##################################################################
# Source: https://stats.oarc.ucla.edu/r/dae/logit-regression/
###################################################################################
###################################################################################
### ###
### GENERALISED LINEAR MODELS, E.G. STATS::GLM(). LINEAR MODELS ASSUME THAT ###
### THE RESPONSE IS CONTINUOUS AND THE ERROR HAS A NORMAL DISTRIBUTION. ###
### GENERALISED LINEAR MODELS EXTEND LINEAR MODELS TO INCLUDE NON-CONTINUOUS ###
### RESPONSES (E.G. BINARY DATA OR COUNTS). THEY WORK BY DEFINING A DISTANCE ###
### METRIC BASED ON THE STATISTICAL IDEA OF LIKELIHOOD. SOURCE : ###
### HTTPS://R4DS.HAD.CO.NZ/MODEL-BASICS.HTML#MISSING-VALUES-5 ###
### ###
###################################################################################
###################################################################################
# Full logistic regression: the 0/1 `default` response is modelled on every
# other column of `train` (`default ~ .`) with a binomial family.
model_glm <- glm(default ~ ., family="binomial", data=train)
# Confidence intervals (2.5 % and 97.5 % bounds) for each coefficient.
confint(model_glm)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.239564e+01 -1.016772e+01
## studentYes -1.100433e+00 -5.835154e-02
## balance 5.363168e-03 6.419470e-03
## income -1.088558e-05 2.498355e-05
# NOTE(review): a second, different confint printout follows. Its studentYes
# and balance bounds are not centred on the estimates shown by summary(), so
# it looks like output left over from another run; confirm and remove.
#
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -1.226004e+01 -1.007588e+01
## studentYes -1.269516e+00 -2.169029e-01
## balance 5.444101e-03 6.502409e-03
## income -1.088558e-05 2.498355e-05
# Coefficient table, deviances and AIC of the full model.
summary(model_glm)
##
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5262 -0.1356 -0.0530 -0.0186 3.7666
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.125e+01 5.679e-01 -19.809 <2e-16 ***
## studentYes -5.813e-01 2.656e-01 -2.189 0.0286 *
## balance 5.874e-03 2.692e-04 21.819 <2e-16 ***
## income 7.023e-06 9.144e-06 0.768 0.4425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2287.4 on 8014 degrees of freedom
## Residual deviance: 1220.8 on 8011 degrees of freedom
## AIC: 1228.8
##
## Number of Fisher Scoring iterations: 8
###########################################################################
###########################################################################
### ###
### INTERPERTATION ###
### ###
###########################################################################
###########################################################################
#
# Only 2 of the 3 predictors are significantly associated with the
# outcome: studentYes (p-value = 0.0286) and balance (p-value < 2e-16).
#
# The z value is the estimate divided by its standard error:
# for studentYes it is -5.813e-01 / 2.656e-01, about -2.19, and
# for balance it is 5.874e-03 / 2.692e-04, about 21.82.
# The intercept estimate is b0 = -1.125e+01, which is negative: it is the
# log-odds of default when student = "No" and balance and income are both 0,
# so the baseline probability of default is very small.
# Number of training rows.
nrow(train)
## [1] 8015
# Columns available to the model.
colnames(train)
## [1] "default" "student" "balance" "income"
# default based on all of the provided predictors,
# and visualize your final predicted model.
#######################################################################################
## tally is a convenient wrapper for summarise that will either call n or ##
## sum(n) depending on whether you're tallying for the first time, or re-tallying. ##
## count() is similar, but also does the group_by for you. ##
#######################################################################################
################################################################################
################################################################################
### ###
### TALLY IS A CONVENIENT WRAPPER FOR SUMMARISE THAT WILL EITHER CALL N OR ###
### SUM(N) DEPENDING ON WHETHER YOU'RE TALLYING FOR THE FIRST TIME, OR ###
### RE-TALLYING. COUNT() IS SIMILAR, BUT ALSO DOES THE GROUP_BY FOR YOU. ###
### ###
################################################################################
################################################################################
# Share of the training rows in each default x student cell.
# count() performs the group_by() + tally() in one step and returns an
# ungrouped tibble; the original pipeline left the result grouped by
# `default`, which silently changes how later dplyr verbs behave.
data <- train %>%
  count(default, student) %>%
  mutate(Percent = n / nrow(train))
#
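# Example output, apparently from a run in which train had 7500 rows.
# Percent is always n / nrow(train), so the counts differ for the current
# 8015-row split.
# # Groups: default [2]
# default student n Percent
# <dbl> <chr> <int> <dbl>
# 1 0 No 5129 0.684 = 5129/7500 <- Percent = n / number of training rows
# 2 0 Yes 2126 0.283 = 2126/7500 <- important: same denominator in every row
# 3 1 No 161 0.0215 = 161/7500
# 4 1 Yes 84 0.0112 = 84/7500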
# Peek at the summary table.
head(data)
# Logo layer: the image is anchored by its top-right corner at x = 1, y = 1
# and drawn with width 0.13 and height 0.2.
uw_logo <- draw_image(
  logo_file,
  x = 1, y = 1, hjust = 1, vjust = 1,
  width = 0.13, height = 0.2
)
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 1: default (0/1) against balance, with a fitted logistic curve.
myplot <- ggplot(train, aes(x = balance, y = default)) +
  # jittered raw observations (x and y are inherited from the plot mapping)
  geom_jitter(color = "turquoise4") +
  theme_minimal_grid() +
  # rug along the bottom axis, coloured by outcome
  geom_rug(aes(color = factor(default)), sides = "b") +
  # single-predictor logistic fit, drawn without a confidence band
  geom_smooth(
    method = "glm", method.args = list(family = "binomial"),
    se = FALSE, size = 2, color = "red"
  ) +
  ggtitle("Plot 1") +
  theme(
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Levels of the outcome in the summary table: 0 and 1.
factor(data$default)
## [1] 0 0 1 1
## Levels: 0 1
# Plot 2: share of training rows per outcome, split by student status.
myplot <- ggplot(data, aes(x = factor(default), y = Percent, fill = student)) +
  geom_jitter(color = "turquoise4") +
  theme_minimal_hgrid() +
  # https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar
  geom_bar(stat = "identity", position = "dodge") +
  # https://ggplot2.tidyverse.org/reference/geom_text.html
  geom_text(
    aes(label = Percent),
    size = 3, angle = 90, position = position_dodge(0.9)
  ) +
  # one panel per outcome, each with its own axis ranges
  # Source: https://www.rdocumentation.org/packages/ggplot2/versions/3.3.5/topics/facet_wrap
  facet_wrap(~default, scales = "free") +
  xlab("default") +
  ylab("Percent") +
  ggtitle("Plot 2") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 3: default (0/1) against income, with a fitted logistic curve.
myplot <- ggplot(train, aes(x = income, y = default)) +
  # jittered raw observations (x and y are inherited from the plot mapping)
  geom_jitter(color = "turquoise4") +
  # rug along the bottom axis, coloured by outcome
  geom_rug(aes(color = factor(default)), sides = "b") +
  # single-predictor logistic fit, drawn without a confidence band
  geom_smooth(
    method = "glm", method.args = list(family = "binomial"),
    se = FALSE, size = 2, color = "red"
  ) +
  theme_minimal_hgrid() +
  ggtitle("Plot 3 ") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'
source : https://r-graph-gallery.com/boxplot.html
#-------------------------------------------------------------
## In-class exercises:
##
## Write a function called logit2prob to convert log-odds to
## the predicted scale
#-------------------------------------------------------------
#' Convert log-odds to probabilities
#'
#' Applies the inverse-logit (logistic) function element-wise.
#'
#' @param x Numeric vector of log-odds (the link scale of a logit model).
#' @return Numeric vector of probabilities in [0, 1], the same length as `x`.
#'   `NA` inputs give `NA`.
logit2prob <- function(x) {
  # plogis() computes exp(x) / (1 + exp(x)) but stays finite for large |x|.
  # The naive formula returns NaN (Inf / Inf) once exp(x) overflows, which
  # happens for x above roughly 709.
  stats::plogis(x)
}
##################################################################################
## We’ll use modelr::add_predictions() which takes a data frame and a model. ##
## It adds the predictions from the model to a new column in the data frame: ##
##################################################################################
# Source : https://r4ds.had.co.nz/model-basics.html#missing-values-5
# Attach the full model's predictions to the training data in a `pred`
# column, then convert them from log-odds to probabilities.
train_res <- add_predictions(train, model_glm)
train_res <- mutate(train_res, pred = logit2prob(pred))
head(train_res)
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 4: predicted probability of default by student status, on a log
# y-axis.
myplot <- ggplot(train_res, aes(x = student, y = pred, fill = student)) +
  geom_boxplot(width = 0.5, lwd = 1.5) +
  theme_minimal_hgrid() +
  scale_y_continuous(trans = "log") +
  ggtitle("Plot 4") +
  theme(
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 5: predicted probability of default against income.
# The smoother fits a logistic curve to `pred`, which is a probability rather
# than a 0/1 outcome. With family = "binomial" that raises the
# "non-integer #successes in a binomial glm!" warning; "quasibinomial" fits
# the same logit curve (identical coefficient estimates) without it.
myplot <- ggplot(train_res, aes(x = income, y = pred)) +
  geom_jitter(color = "turquoise4") +
  theme_minimal_grid() +
  geom_point(aes(color = student), alpha = 0.3) +
  geom_smooth(
    method = "glm", method.args = list(family = "quasibinomial"),
    se = FALSE, size = 2, color = "red"
  ) +
  labs(y = "Predicted probability of default") +
  ggtitle("Plot 5") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
## `geom_smooth()` using formula 'y ~ x'
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
source : https://r-graph-gallery.com/boxplot.html
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 6: balance by student status.
myplot <- ggplot(train_res, aes(x = student, y = balance, fill = student)) +
  geom_boxplot(width = 0.5, lwd = 1.5) +
  ggtitle("Plot 6") +
  theme(
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
############################################################################
############################################################################
### ###
### OUR PLOT START HERE ###
### ###
############################################################################
############################################################################
# Plot 7: predicted probability of default against balance, one colour per
# student status (colour is inherited from the plot-level mapping).
myplot <- ggplot(train_res, aes(x = balance, y = pred, color = student)) +
  # jittered copy of the points in a fixed colour
  geom_jitter(color = "turquoise4") +
  theme_minimal_hgrid() +
  geom_point() +
  geom_line() +
  labs(y = "Predicted probability of default") +
  ggtitle("Plot 7") +
  theme(
    plot.caption = element_text(size = 18),
    plot.tag = element_text(color = "darkred", size = 18),
    panel.border = element_rect(color = "steelblue", size = 2),
    plot.title = element_text(size = 18),
    axis.title.x = element_text(size = 16),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(
      angle = 90, vjust = 0.1, hjust = 1, size = 18, face = "bold"
    ),
    axis.text.y = element_text(
      angle = 0, vjust = 1, hjust = 0.1, size = 18, face = "bold"
    )
  )
# Overlay the logo in the top-right corner of the finished plot.
ggdraw(myplot) +
  draw_image(
    logo_file,
    x = 1, y = 1, hjust = 1, vjust = 1,
    width = 0.13, height = 0.2
  )
############################################################################
############################################################################
### ###
### Interpertation ###
### ###
############################################################################
############################################################################
# Note that the change in predicted probability is not constant: the curve
# is almost flat for balances below about 1000, rises more quickly in the
# middle, and then levels out at the end.
# The difference in probabilities between balances of 0 and 1000 is far
# smaller than the difference between 1000 and 2000, or beyond 2000.
# Classify with a 0.5 cut-off: predicted probabilities of at least 0.5 are
# labelled 1 (default), everything below is labelled 0.
train <- mutate(train_res, pred_default = as.numeric(pred >= 0.5))
head(train)
# Same steps for the test rows: predict, convert to probability, classify.
test <- add_predictions(test, model_glm)
test <- mutate(
  test,
  pred = logit2prob(pred),
  pred_default = as.numeric(pred >= 0.5)
)
head(test)
print("Our test error for model 1")
## [1] "Our test error for model 1"
# Misclassification rate: share of rows whose label differs from the truth.
mean(test$pred_default != test$default)
## [1] 0.02871537
print("Our train error for model 1")
## [1] "Our train error for model 1"
mean(train$pred_default != train$default)
## [1] 0.02632564
Next, fit a second logistic regression model using only income as a predictor and compare its error rate to that of the full model.
# Second model: logistic regression of `default` on `income` alone, fitted
# to train_res (the training rows plus the model-1 `pred` column, which this
# formula does not use).
mod2 <- glm(default ~ income, family="binomial", data=train_res)
# Print the fitted call, coefficients and deviances.
mod2
##
## Call: glm(formula = default ~ income, family = "binomial", data = train_res)
##
## Coefficients:
## (Intercept) income
## -3.158e+00 -7.312e-06
##
## Degrees of Freedom: 8014 Total (i.e. Null); 8013 Residual
## Null Deviance: 2287
## Residual Deviance: 2285 AIC: 2289
# Re-score both sets with the income-only model. add_predictions() writes to
# `pred` again, so the model-1 predictions in `train` and `test` are replaced.
train <- train %>%
  add_predictions(mod2) %>%
  mutate(
    pred = logit2prob(pred),
    pred_default = ifelse(pred < 0.5, 0, 1)
  )
head(train)
test <- test %>%
  add_predictions(mod2) %>%
  mutate(
    pred = logit2prob(pred),
    pred_default = ifelse(pred < 0.5, 0, 1)
  )
head(test)
print("Our test error for model 2")
## [1] "Our test error for model 2"
mean(test$pred_default != test$default)
## [1] 0.0372796
print("Our train error for model 2")
## [1] "Our train error for model 2"
# Compare predictions and truth from the same data frame (`train`), as the
# test-set and model-1 error lines do, rather than mixing in train_res.
mean(train$pred_default != train$default)
## [1] 0.03231441
# Re-print the full-model summary that the written interpretation refers to.
summary(model_glm)
##
## Call:
## glm(formula = default ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5262 -0.1356 -0.0530 -0.0186 3.7666
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.125e+01 5.679e-01 -19.809 <2e-16 ***
## studentYes -5.813e-01 2.656e-01 -2.189 0.0286 *
## balance 5.874e-03 2.692e-04 21.819 <2e-16 ***
## income 7.023e-06 9.144e-06 0.768 0.4425
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2287.4 on 8014 degrees of freedom
## Residual deviance: 1220.8 on 8011 degrees of freedom
## AIC: 1228.8
##
## Number of Fisher Scoring iterations: 8
y = the predicted value of the dependent variable
B0 = the y-intercept (value of y when all other parameters are set to 0)
B1X1= the regression coefficient (B1) of the first independent variable (X1) (a.k.a. the effect that increasing the value of the independent variable has on the predicted y value)
… = do the same for however many independent variables you are testing
BnXn = the regression coefficient of the last independent variable
e = model error (a.k.a. how much variation there is in our estimate of y)
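Our formula (for logistic regression the linear predictor gives the log-odds of default, so there is no additive error term e):
$$\log\frac{P(\text{default}=1)}{1-P(\text{default}=1)} = -11.25 - 0.5813\,\text{studentYes} + 0.005874\,\text{balance} + 7.023\times10^{-6}\,\text{income}$$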
As the result above shows, the intercept is -1.125e+01. Of all the predictors, only income has a very high p-value (0.4425).
Overall, based on the p-value, income is not a significant predictor of default.
All other variables do much better. For example, studentYes has a p-value of 0.0286 and a coefficient of -5.813e-01 on the log-odds scale:
being a student (studentYes = 1 rather than 0) is associated with a change of -5.813e-01 in the log odds of the response variable default taking the value 1, holding balance and income fixed. This means that being a student is associated with a lower likelihood of default taking the value 1.
in addition, for balance , we have p-value(balance) <2e-16 and the log odds is 5.874e-03 .
The log-odds function of probabilities is often used in state estimation algorithms because of its numerical advantages in the case of small probabilities.
As the formula shows, being a student lowers the log odds of default by 5.813e-01, whereas balance and income have positive coefficients: each additional dollar raises the log odds by 5.874e-03 and 7.023e-06, respectively.
It can be seen that only 2 out of the 3 predictors are significantly associated to the outcome. These include: studentYes p-value = 0.0286 and balance < 2e-16 .
The z value for studentYes is calculated as -5.813e-01 / 2.656e-01, about -2.19, and the z value for balance as 5.874e-03 / 2.692e-04, about 21.82.
The intercept estimate is b0 = -1.125e+01, which is negative: it is the log odds of default for a non-student with zero balance and zero income, so the baseline probability of default is very small.
It can be seen that, changing in studentYes and balance are significantly associated to changes in credit debt while changes in income budget is not significantly associated with credit debt.
We can also see from the boxplot in Plot 4 that non-students tend to have a lower predicted probability of default than students. Plot 6 suggests why: students tend to carry a higher remaining balance.
In machine learning, there are two important concepts: the training error and the test error.
Training Error: We get the by calculating the classification error of a model on the same data the model was trained on (just like the example above). Test Error: We get this by using two completely disjoint datasets: one to train the model and the other to calculate the classification error. Both datasets need to have values for y. The first dataset is called training data and the second, test data.
You get the training error when you run the trained model against the training data. It is important to remember that this data was already used to train the model, so a low training error does not imply that the model will be accurate when applied to new data.
Source : https://rapidminer.com/blog/validate-models-training-test-error/
The test error of model 1 (all predictors) is 0.02871537, which is great.
1985 * 0.02871537 = 57.0000095 ~ 57 errors occur on the test set for model 1.
For comparison, the second model (income only) has a test error of 0.0372796.
1985 * 0.0372796 = 74.000006 ~ 74 errors occur on the test set for model 2.
Also, the training error rate of model 1 is 0.02632564.
8015 * 0.02632564 = 211.000005 ~ 211 errors found on the training set for model 1.
The training error rate of the second model is 0.03231441.
8015 * 0.03231441 = 258.999996 ~ 259 errors found on the training set for model 2.
All in all, model 1 has the lower error on both the test set and the training set,
so model 1 is the winner.
# Silence all warnings from here on (-1 hides them, 0 shows them again).
# NOTE(review): this is a global setting and also hides genuine problems;
# consider restoring options(warn = 0) once the noisy call has run.
options(warn = -1)
pacman::p_load(
  dplyr, tidyr, cowplot,
  tidyverse, viridis, GGally
)
# Read the teaching-evaluation data from the working directory.
data <- read_csv(file = "evals-mod.csv")
## Rows: 463 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): rank, ethnicity, gender, language, cls_level, cls_profs, cls_credits
## dbl (11): score, age, cls_perc_eval, cls_did_eval, cls_students, bty_f1lower...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Column names of the evaluation data (18 columns).
colnames(data)
## [1] "score" "rank" "ethnicity" "gender"
## [5] "language" "age" "cls_perc_eval" "cls_did_eval"
## [9] "cls_students" "cls_level" "cls_profs" "cls_credits"
## [13] "bty_f1lower" "bty_f1upper" "bty_f2upper" "bty_m1lower"
## [17] "bty_m1upper" "bty_m2upper"
# First six rows.
head(data, n = 6L)
# Dimensions: 463 rows, 18 columns.
dim(data)
## [1] 463 18
# overview of our data
# Keep warnings suppressed for this call (-1 hides them, 0 shows them).
options(warn = -1)
# Pairwise overview of the evaluation data. ggpairs() has no `colour`
# argument, so the original `colour = "gender"` was ignored; it is dropped
# here. To colour by gender, pass an aesthetic mapping instead, e.g.
# mapping = ggplot2::aes(colour = gender).
ggpairs(data, title = "correlogram with ggpairs()") +
  theme(panel.grid.major = element_blank())
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
I learned what a factor is and how to use it.
source : https://www.youtube.com/watch?v=xkRBfy8_2MU
# Fix the seed so that the random draws below are reproducible
# (provided you use the same pseudo-random number generator).
set.seed(12345)
# Load the evaluations data; read_csv reports 463 rows and 18 columns.
data <- read_csv("evals-mod.csv")
## Rows: 463 Columns: 18
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (7): rank, ethnicity, gender, language, cls_level, cls_profs, cls_credits
## dbl (11): score, age, cls_perc_eval, cls_did_eval, cls_students, bty_f1lower...
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Inspect the raw column names (18 columns, six of them bty_* ratings).
names(data)
## [1] "score" "rank" "ethnicity" "gender"
## [5] "language" "age" "cls_perc_eval" "cls_did_eval"
## [9] "cls_students" "cls_level" "cls_profs" "cls_credits"
## [13] "bty_f1lower" "bty_f1upper" "bty_f2upper" "bty_m1lower"
## [17] "bty_m1upper" "bty_m2upper"
library(broom)
##
## Attaching package: 'broom'
##
## The following object is masked from 'package:modelr':
##
## bootstrap
# Goal: build the best possible model for the average teacher evaluation
# score from some combination of rank, ethnicity, gender, language, age,
# cls_perc_eval, cls_did_eval, cls_students, cls_level, cls_profs,
# cls_credits and bty_average.

# Collapse the individual bty_* ratings into one row-wise mean ...
data$bty_average <- data %>%
  select(starts_with("bty_")) %>%
  rowMeans()

# ... then drop the individual ratings (their names end in "lower"/"upper").
data <- select(data, -ends_with("lower"), -ends_with("upper"))

# Check the remaining columns.
names(data)
## [1] "score" "rank" "ethnicity" "gender"
## [5] "language" "age" "cls_perc_eval" "cls_did_eval"
## [9] "cls_students" "cls_level" "cls_profs" "cls_credits"
## [13] "bty_average"
# ---- Part a: split data into a training (~80%) and a test (~20%) set ----
# Every row gets an independent uniform draw and goes to "train" when the
# draw is below 0.80, so the split is only approximately 80/20
# (here 360 of the 463 rows end up in the training set).
set.seed(12345)
index <- runif(nrow(data))
data$type <- ifelse(index >= 0.80, "test", "train")

# Materialise both subsets without the helper column.
train <- data %>%
  filter(type == "train") %>%
  select(-type)
test <- data %>%
  filter(type == "test") %>%
  select(-type)

dim(train)
## [1] 360 13
# `data` itself still carries the helper column `type` added for the split.
names(data)
## [1] "score" "rank" "ethnicity" "gender"
## [5] "language" "age" "cls_perc_eval" "cls_did_eval"
## [9] "cls_students" "cls_level" "cls_profs" "cls_credits"
## [13] "bty_average" "type"
# NOTE: warn = -1 silences every warning for the rest of the session;
# set warn = 0 to show warnings again.
options(warn = -1)

# Distribution of the response variable. `bins` is a parameter of the
# histogram layer, not an aesthetic: inside aes() it is ignored, so it has
# to be passed to geom_histogram() directly.
ggplot(data) +
  geom_histogram(aes(x = score), bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#################################################################
## Part 1 is done! ##
#################################################################
A variable with a very low adjusted R^2 and a large p-value can be expected to be the worst predictor of evaluation scores.
As you can see, cls_profs has an adjusted R^2 of -0.002291 and a p-value of 0.672, so we can definitely say that cls_profs is the worst predictor of evaluation scores.
# rank
# Categorical predictor: the summary reports coefficients for "tenure track"
# and "tenured"; the remaining rank level is the reference.
model_rank <- lm(score ~ rank, data = train)
summary(model_rank)
##
## Call:
## lm(formula = score ~ rank, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.81707 -0.32687 0.08627 0.38961 0.87313
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.31039 0.06163 69.936 <2e-16 ***
## ranktenure track -0.19332 0.08582 -2.252 0.0249 *
## ranktenured -0.18352 0.07248 -2.532 0.0118 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5408 on 357 degrees of freedom
## Multiple R-squared: 0.01979, Adjusted R-squared: 0.01429
## F-statistic: 3.603 on 2 and 357 DF, p-value: 0.02823
# ethnicity
# Categorical predictor: the coefficient is reported for "not minority",
# so the other ethnicity level is the reference.
model_ethnicity <- lm(score ~ ethnicity, data = train)
summary(model_ethnicity)
##
## Call:
## lm(formula = score ~ ethnicity, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8802 -0.3802 0.1198 0.4198 0.9447
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.05532 0.07933 51.119 <2e-16 ***
## ethnicitynot minority 0.12487 0.08508 1.468 0.143
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5439 on 358 degrees of freedom
## Multiple R-squared: 0.005981, Adjusted R-squared: 0.003205
## F-statistic: 2.154 on 1 and 358 DF, p-value: 0.1431
# gender
# Categorical predictor: the coefficient is reported for "male", so male is
# NOT the reference level; the other gender level is the baseline.
model_gender <- lm(score ~ gender, data = train)
summary(model_gender)
##
## Call:
## lm(formula = score ~ gender, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.82783 -0.33895 0.07217 0.42770 0.82770
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.07230 0.04439 91.731 < 2e-16 ***
## gendermale 0.15553 0.05785 2.689 0.00751 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5401 on 358 degrees of freedom
## Multiple R-squared: 0.01979, Adjusted R-squared: 0.01705
## F-statistic: 7.228 on 1 and 358 DF, p-value: 0.007512
# language
# Categorical predictor: the coefficient is reported for "non-english",
# so the other language level is the reference.
model_language <- lm(score ~ language, data = train)
summary(model_language)
##
## Call:
## lm(formula = score ~ language, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8794 -0.3199 0.1206 0.4206 0.8206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.17941 0.02938 142.261 <2e-16 ***
## languagenon-english -0.27941 0.12464 -2.242 0.0256 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5417 on 358 degrees of freedom
## Multiple R-squared: 0.01384, Adjusted R-squared: 0.01109
## F-statistic: 5.025 on 1 and 358 DF, p-value: 0.02559
# age
# Numeric predictor: a single slope is estimated, there is no reference level.
model_age <- lm(score ~ age, data = train)
summary(model_age)
##
## Call:
## lm(formula = score ~ age, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.90236 -0.34448 0.08711 0.41867 0.88710
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.418117 0.143655 30.755 <2e-16 ***
## age -0.005262 0.002914 -1.806 0.0718 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.543 on 358 degrees of freedom
## Multiple R-squared: 0.009028, Adjusted R-squared: 0.00626
## F-statistic: 3.261 on 1 and 358 DF, p-value: 0.07177
# cls_perc_eval
# Numeric predictor: a single slope is estimated, there is no reference level.
model_cls_perc_eval <- lm(score ~ cls_perc_eval, data = train)
summary(model_cls_perc_eval)
##
## Call:
## lm(formula = score ~ cls_perc_eval, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91969 -0.34406 0.08118 0.40326 1.05171
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.697766 0.126253 29.288 < 2e-16 ***
## cls_perc_eval 0.006263 0.001654 3.788 0.000178 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5349 on 358 degrees of freedom
## Multiple R-squared: 0.03853, Adjusted R-squared: 0.03584
## F-statistic: 14.35 on 1 and 358 DF, p-value: 0.0001784
# cls_did_eval
# Numeric predictor: a single slope is estimated, there is no reference level.
model_cls_did_eval <- lm(score ~ cls_did_eval, data = train)
summary(model_cls_did_eval)
##
## Call:
## lm(formula = score ~ cls_did_eval, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8422 -0.3498 0.1097 0.4286 0.8610
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.1341606 0.0368554 112.172 <2e-16 ***
## cls_did_eval 0.0008015 0.0006239 1.285 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5442 on 358 degrees of freedom
## Multiple R-squared: 0.004589, Adjusted R-squared: 0.001808
## F-statistic: 1.65 on 1 and 358 DF, p-value: 0.1998
# cls_students
# Numeric predictor: a single slope is estimated, there is no reference level.
model_cls_students <- lm(score ~ cls_students, data = train)
summary(model_cls_students)
##
## Call:
## lm(formula = score ~ cls_students, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8551 -0.3574 0.1247 0.4400 0.8451
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.1526869 0.0355180 116.918 <2e-16 ***
## cls_students 0.0001979 0.0003688 0.537 0.592
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5453 on 358 degrees of freedom
## Multiple R-squared: 0.000804, Adjusted R-squared: -0.001987
## F-statistic: 0.2881 on 1 and 358 DF, p-value: 0.5918
# cls_level
# Categorical predictor: the coefficient is reported for "upper",
# so the other class level is the reference.
model_cls_level <- lm(score ~ cls_level, data = train)
summary(model_cls_level)
##
## Call:
## lm(formula = score ~ cls_level, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83504 -0.33504 0.08254 0.40314 0.86496
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.21746 0.04847 87.012 <2e-16 ***
## cls_levelupper -0.08242 0.06012 -1.371 0.171
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5441 on 358 degrees of freedom
## Multiple R-squared: 0.005222, Adjusted R-squared: 0.002443
## F-statistic: 1.879 on 1 and 358 DF, p-value: 0.1713
# cls_profs
# Categorical predictor: the coefficient is reported for "single",
# so the other cls_profs level is the reference.
model_cls_profs <- lm(score ~ cls_profs, data = train)
summary(model_cls_profs)
##
## Call:
## lm(formula = score ~ cls_profs, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.8467 -0.3725 0.1275 0.4275 0.8533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.17250 0.03520 118.526 <2e-16 ***
## cls_profssingle -0.02583 0.06097 -0.424 0.672
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5454 on 358 degrees of freedom
## Multiple R-squared: 0.0005012, Adjusted R-squared: -0.002291
## F-statistic: 0.1795 on 1 and 358 DF, p-value: 0.6721
# cls_credits
# Categorical predictor: the coefficient is reported for "one credit",
# so the other cls_credits level is the reference.
model_cls_credits <- lm(score ~ cls_credits, data = train)
summary(model_cls_credits)
##
## Call:
## lm(formula = score ~ cls_credits, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83735 -0.33735 0.06265 0.36265 0.86265
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.13735 0.02898 142.769 < 2e-16 ***
## cls_creditsone credit 0.47765 0.12295 3.885 0.000122 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5344 on 358 degrees of freedom
## Multiple R-squared: 0.04045, Adjusted R-squared: 0.03777
## F-statistic: 15.09 on 1 and 358 DF, p-value: 0.0001219
# bty_average
# Numeric predictor: a single slope is estimated, there is no reference level.
model_bty_average <- lm(score ~ bty_average, data = train)
summary(model_bty_average)
##
## Call:
## lm(formula = score ~ bty_average, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.9134 -0.3582 0.1418 0.4080 0.9411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.87139 0.08634 44.840 < 2e-16 ***
## bty_average 0.06619 0.01846 3.585 0.000384 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.536 on 358 degrees of freedom
## Multiple R-squared: 0.03466, Adjusted R-squared: 0.03196
## F-statistic: 12.85 on 1 and 358 DF, p-value: 0.0003837
# Columns available in the training set (response `score` plus 12 predictors).
names(train)
## [1] "score" "rank" "ethnicity" "gender"
## [5] "language" "age" "cls_perc_eval" "cls_did_eval"
## [9] "cls_students" "cls_level" "cls_profs" "cls_credits"
## [13] "bty_average"
library(sjPlot)
## Registered S3 method overwritten by 'parameters':
## method from
## format.parameters_distribution datawizard
## Learn more about sjPlot with 'browseVignettes("sjPlot")'.
##
## Attaching package: 'sjPlot'
## The following objects are masked from 'package:cowplot':
##
## plot_grid, save_plot
library(sjmisc)
##
## Attaching package: 'sjmisc'
## The following object is masked from 'package:purrr':
##
## is_empty
## The following object is masked from 'package:tibble':
##
## add_case
## The following object is masked from 'package:tidyr':
##
## replace_na
library(sjlabelled)
##
## Attaching package: 'sjlabelled'
## The following object is masked from 'package:forcats':
##
## as_factor
## The following object is masked from 'package:ggplot2':
##
## as_label
## The following object is masked from 'package:dplyr':
##
## as_label
# Side-by-side regression table of the twelve single-predictor models,
# also written to "output.html".
# https://www.rdocumentation.org/packages/sjPlot/versions/2.8.10/topics/tab_model
tab <- tab_model(
  model_rank, model_ethnicity, model_gender, model_language, model_age,
  model_cls_perc_eval, model_cls_did_eval, model_cls_students,
  model_cls_level, model_cls_profs, model_cls_credits, model_bty_average,
  digits = 5,
  dv.labels = c(
    "model_rank", "model_ethnicity", "model_gender", "model_language",
    "model_age", "model_cls_perc_eval", "model_cls_did_eval",
    "model_cls_students", "model_cls_level", "model_cls_profs",
    "model_cls_credits", "model_bty_average"
  ),
  title = "evals-mod",
  show.p = TRUE,
  show.se = TRUE,
  # Pass the confidence level as a number. With a bare TRUE every interval
  # in the rendered table came out as "-Inf – Inf", which is what a
  # confidence level of 1 (TRUE coerced to numeric) would produce.
  show.ci = 0.95,
  collapse.se = TRUE,
  p.style = "stars",
  show.aic = TRUE,
  show.fstat = TRUE,
  show.r2 = TRUE,
  file = "output.html"
)

# Relabel the R2 row in the generated HTML page.
tab$page.complete <- gsub(
  "adjusted", "adjusted or conditional",
  tab$page.complete
)
tab
| model_rank | model_ethnicity | model_gender | model_language | model_age | model_cls_perc_eval | model_cls_did_eval | model_cls_students | model_cls_level | model_cls_profs | model_cls_credits | model_bty_average | |||||||||||||
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Predictors | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI | Estimates | CI |
| (Intercept) |
4.31039 *** (0.06163) |
-Inf – Inf |
4.05532 *** (0.07933) |
-Inf – Inf |
4.07230 *** (0.04439) |
-Inf – Inf |
4.17941 *** (0.02938) |
-Inf – Inf |
4.41812 *** (0.14366) |
-Inf – Inf |
3.69777 *** (0.12625) |
-Inf – Inf |
4.13416 *** (0.03686) |
-Inf – Inf |
4.15269 *** (0.03552) |
-Inf – Inf |
4.21746 *** (0.04847) |
-Inf – Inf |
4.17250 *** (0.03520) |
-Inf – Inf |
4.13735 *** (0.02898) |
-Inf – Inf |
3.87139 *** (0.08634) |
-Inf – Inf |
| rank [tenure track] |
-0.19332 * (0.08582) |
-Inf – Inf | ||||||||||||||||||||||
| rank [tenured] |
-0.18352 * (0.07248) |
-Inf – Inf | ||||||||||||||||||||||
| ethnicity [not minority] |
0.12487 (0.08508) |
-Inf – Inf | ||||||||||||||||||||||
| gender [male] |
0.15553 ** (0.05785) |
-Inf – Inf | ||||||||||||||||||||||
| language [non-english] |
-0.27941 * (0.12464) |
-Inf – Inf | ||||||||||||||||||||||
| age |
-0.00526 (0.00291) |
-Inf – Inf | ||||||||||||||||||||||
| cls perc eval |
0.00626 *** (0.00165) |
-Inf – Inf | ||||||||||||||||||||||
| cls did eval |
0.00080 (0.00062) |
-Inf – Inf | ||||||||||||||||||||||
| cls students |
0.00020 (0.00037) |
-Inf – Inf | ||||||||||||||||||||||
| cls level [upper] |
-0.08242 (0.06012) |
-Inf – Inf | ||||||||||||||||||||||
| cls profs [single] |
-0.02583 (0.06097) |
-Inf – Inf | ||||||||||||||||||||||
| cls credits [one credit] |
0.47765 *** (0.12295) |
-Inf – Inf | ||||||||||||||||||||||
| bty average |
0.06619 *** (0.01846) |
-Inf – Inf | ||||||||||||||||||||||
| Observations | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | 360 | ||||||||||||
| R2 / R2 adjusted | 0.020 / 0.014 | 0.006 / 0.003 | 0.020 / 0.017 | 0.014 / 0.011 | 0.009 / 0.006 | 0.039 / 0.036 | 0.005 / 0.002 | 0.001 / -0.002 | 0.005 / 0.002 | 0.001 / -0.002 | 0.040 / 0.038 | 0.035 / 0.032 | ||||||||||||
| AIC | 584.079 | 587.114 | 582.078 | 584.256 | 586.009 | 575.129 | 587.618 | 588.984 | 587.389 | 589.093 | 574.408 | 576.576 | ||||||||||||
|
||||||||||||||||||||||||
library(GGally)
# Pairwise plot matrix of every column in the training set.
ggpairs(train)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
OUR RESULT SHOWS THAT WE HAVE 7 TERMS AS VARIABLES, INCLUDING: cls_credits, bty_average, gender, cls_perc_eval, ethnicity, language, age. Based on the question that was asked, we can ignore cls_did_eval because it measures the same thing as cls_perc_eval; we can also see that it is highly correlated with cls_students.
Call: lm(formula = score ~ cls_did_eval, data = train)
Residuals:
     Min      1Q  Median      3Q     Max
 -1.8422 -0.3498  0.1097  0.4286  0.8610
Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)  4.1341606  0.0368554 112.172   <2e-16 ***
cls_did_eval 0.0008015  0.0006239   1.285      0.2
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5442 on 358 degrees of freedom
Multiple R-squared: 0.004589, Adjusted R-squared: 0.001808
F-statistic: 1.65 on 1 and 358 DF, p-value: 0.1998
Call: lm(formula = score ~ cls_perc_eval, data = train)
Residuals:
      Min       1Q   Median       3Q      Max
 -1.91969 -0.34406  0.08118  0.40326  1.05171
Coefficients:
              Estimate Std. Error t value Pr(>|t|)
(Intercept)   3.697766   0.126253  29.288  < 2e-16 ***
cls_perc_eval 0.006263   0.001654   3.788 0.000178 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.5349 on 358 degrees of freedom
Multiple R-squared: 0.03853, Adjusted R-squared: 0.03584
F-statistic: 14.35 on 1 and 358 DF, p-value: 0.0001784
I will use stepwise regression; the code below selects variables by p-value with olsrr's ols_step_both_p() and reports the AIC at each step. Reasons: 1. It is easy to apply. 2. It improves model generalizability. 3. It yields a simple model that is easy to interpret. 4. It is objective and reproducible.
Our final model is shown here:
                       Estimate Std. Error t value Pr(>|t|)
(Intercept)            3.413737   0.228366  14.949  < 2e-16 ***
ethnicitynot minority  0.198474   0.085403   2.324 0.020696 *
gendermale             0.201301   0.057043   3.529 0.000473 ***
languagenon-english   -0.220350   0.120603  -1.827 0.068537 .
age                   -0.004885   0.002943  -1.660 0.097816 .
cls_perc_eval          0.005436   0.001609   3.378 0.000810 ***
cls_creditsone credit  0.491295   0.120843   4.066 5.91e-05 ***
bty_average            0.062328   0.018374   3.392 0.000772 ***
Our model: predicted score = 3.413737 + 0.198474 * [ethnicity not minority] + 0.201301 * [gendermale] - 0.220350 * [languagenon-english] - 0.004885 * [age] + 0.005436 * [cls_perc_eval] + 0.491295 * [cls_creditsone credit] + 0.062328 * [bty_average]
This follows the general form y = B0 + B1X1 + … + BnXn + e, where:
y = the predicted value of the dependent variable
B0 = the y-intercept (value of y when all other parameters are set to 0)
B1X1= the regression coefficient (B1) of the first independent variable (X1) (a.k.a. the effect that increasing the value of the independent variable has on the predicted y value)
… = do the same for however many independent variables you are testing
BnXn = the regression coefficient of the last independent variable
e = model error (a.k.a. how much variation there is in our estimate of y)
############################################################################
### ###
### INTERPERTATION. ###
### ###
############################################################################
############################################################################
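The coefficient of age is -0.004885: for each additional year of age, the predicted evaluation score decreases by 0.004885, holding the other predictors constant. Remember that the p-value for age is 0.097816. On the other hand, gendermale has a coefficient of 0.201301 with a p-value of 0.000473, which means that the predicted score of male professors is 0.201301 higher than that of female professors, holding the other predictors constant.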
############################################################################
############################################################################
### ###
### Stepwise Regression. ###
### ###
############################################################################
############################################################################
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
##################################################################################
##################################################################################
### ###
### BUILD REGRESSION MODEL FROM A SET OF CANDIDATE PREDICTOR VARIABLES BY ###
### ENTERING AND REMOVING PREDICTORS BASED ON P VALUES, IN A STEPWISE MANNER ###
### UNTIL THERE IS NO VARIABLE LEFT TO ENTER OR REMOVE ANY MORE. THE MODEL ###
### SHOULD INCLUDE ALL THE CANDIDATE PREDICTOR VARIABLES. IF DETAILS IS SET ###
### TO TRUE, EACH STEP IS DISPLAYED.. ###
### ###
##################################################################################
##################################################################################
# Full model: score regressed on every other column of the training set
# (the 12 candidate predictors).
model <- lm(score ~ ., data = train)
# Stepwise selection that enters and removes predictors based on p values.
ols_step_both_p(model)
##
## Stepwise Selection Summary
## ------------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------------------------
## 1 cls_credits addition 0.040 0.038 45.3740 574.4081 0.5344
## 2 bty_average addition 0.080 0.075 30.7900 561.2173 0.5239
## 3 gender addition 0.104 0.097 22.7230 553.6736 0.5178
## 4 cls_perc_eval addition 0.128 0.118 14.8580 546.0691 0.5116
## 5 ethnicity addition 0.148 0.136 8.5540 539.7809 0.5065
## 6 language addition 0.156 0.141 7.2310 538.4109 0.5048
## 7 age addition 0.162 0.145 6.4880 537.6038 0.5036
## ------------------------------------------------------------------------------------------
# Store the stepwise selection result so it can be plotted.
k <- ols_step_both_p(model)
###########################################################################################
## The plot method shows the panel of fit criteria for Stepwise Regression methods. ##
###########################################################################################
plot(k)
############################################################################
############################################################################
### ###
### DETAILED OUTPUT ###
### ###
############################################################################
############################################################################
# Re-run the same selection with details = TRUE so that every step prints
# its model summary, ANOVA table and parameter estimates.
ols_step_both_p(model, details = TRUE)
## Stepwise Selection Method
## ---------------------------
##
## Candidate Terms:
##
## 1. rank
## 2. ethnicity
## 3. gender
## 4. language
## 5. age
## 6. cls_perc_eval
## 7. cls_did_eval
## 8. cls_students
## 9. cls_level
## 10. cls_profs
## 11. cls_credits
## 12. bty_average
##
## We are selecting variables based on p value...
##
##
## Stepwise Selection: Step 1
##
## - cls_credits added
##
## Model Summary
## --------------------------------------------------------------
## R 0.201 RMSE 0.534
## R-Squared 0.040 Coef. Var 12.833
## Adj. R-Squared 0.038 MSE 0.286
## Pred R-Squared 0.032 MAE 0.435
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 4.309 1 4.309 15.093 1e-04
## Residual 102.221 358 0.286
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------
## (Intercept) 4.137 0.029 142.769 0.000 4.080 4.194
## cls_creditsone credit 0.478 0.123 0.201 3.885 0.000 0.236 0.719
## ------------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 2
##
## - bty_average added
##
## Model Summary
## --------------------------------------------------------------
## R 0.283 RMSE 0.524
## R-Squared 0.080 Coef. Var 12.583
## Adj. R-Squared 0.075 MSE 0.275
## Pred R-Squared 0.066 MAE 0.431
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 8.533 2 4.267 15.543 0.0000
## Residual 97.997 357 0.275
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.822 0.085 44.859 0.000 3.655 3.990
## cls_creditsone credit 0.507 0.121 0.214 4.199 0.000 0.270 0.745
## bty_average 0.071 0.018 0.200 3.923 0.000 0.035 0.106
## -----------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.283 RMSE 0.524
## R-Squared 0.080 Coef. Var 12.583
## Adj. R-Squared 0.075 MSE 0.275
## Pred R-Squared 0.066 MAE 0.431
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 8.533 2 4.267 15.543 0.0000
## Residual 97.997 357 0.275
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.822 0.085 44.859 0.000 3.655 3.990
## cls_creditsone credit 0.507 0.121 0.214 4.199 0.000 0.270 0.745
## bty_average 0.071 0.018 0.200 3.923 0.000 0.035 0.106
## -----------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 3
##
## - gender added
##
## Model Summary
## --------------------------------------------------------------
## R 0.323 RMSE 0.518
## R-Squared 0.104 Coef. Var 12.434
## Adj. R-Squared 0.097 MSE 0.268
## Pred R-Squared 0.085 MAE 0.422
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 11.097 3 3.699 13.798 0.0000
## Residual 95.434 356 0.268
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.692 0.094 39.180 0.000 3.506 3.877
## cls_creditsone credit 0.490 0.119 0.206 4.098 0.000 0.255 0.725
## bty_average 0.078 0.018 0.218 4.315 0.000 0.042 0.113
## gendermale 0.173 0.056 0.157 3.093 0.002 0.063 0.283
## -----------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.323 RMSE 0.518
## R-Squared 0.104 Coef. Var 12.434
## Adj. R-Squared 0.097 MSE 0.268
## Pred R-Squared 0.085 MAE 0.422
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 11.097 3 3.699 13.798 0.0000
## Residual 95.434 356 0.268
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.692 0.094 39.180 0.000 3.506 3.877
## cls_creditsone credit 0.490 0.119 0.206 4.098 0.000 0.255 0.725
## bty_average 0.078 0.018 0.218 4.315 0.000 0.042 0.113
## gendermale 0.173 0.056 0.157 3.093 0.002 0.063 0.283
## -----------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 4
##
## - cls_perc_eval added
##
## Model Summary
## --------------------------------------------------------------
## R 0.357 RMSE 0.512
## R-Squared 0.128 Coef. Var 12.287
## Adj. R-Squared 0.118 MSE 0.262
## Pred R-Squared 0.105 MAE 0.415
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 13.609 4 3.402 12.998 0.0000
## Residual 92.921 355 0.262
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.352 0.144 23.308 0.000 3.069 3.635
## cls_creditsone credit 0.438 0.119 0.184 3.672 0.000 0.203 0.672
## bty_average 0.069 0.018 0.193 3.804 0.000 0.033 0.104
## gendermale 0.187 0.055 0.169 3.372 0.001 0.078 0.296
## cls_perc_eval 0.005 0.002 0.158 3.098 0.002 0.002 0.008
## -----------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.357 RMSE 0.512
## R-Squared 0.128 Coef. Var 12.287
## Adj. R-Squared 0.118 MSE 0.262
## Pred R-Squared 0.105 MAE 0.415
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 13.609 4 3.402 12.998 0.0000
## Residual 92.921 355 0.262
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.352 0.144 23.308 0.000 3.069 3.635
## cls_creditsone credit 0.438 0.119 0.184 3.672 0.000 0.203 0.672
## bty_average 0.069 0.018 0.193 3.804 0.000 0.033 0.104
## gendermale 0.187 0.055 0.169 3.372 0.001 0.078 0.296
## cls_perc_eval 0.005 0.002 0.158 3.098 0.002 0.002 0.008
## -----------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 5
##
## - ethnicity added
##
## Model Summary
## --------------------------------------------------------------
## R 0.384 RMSE 0.506
## R-Squared 0.148 Coef. Var 12.163
## Adj. R-Squared 0.136 MSE 0.257
## Pred R-Squared 0.120 MAE 0.406
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 15.724 5 3.145 12.26 0.0000
## Residual 90.806 354 0.257
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.106 0.166 18.702 0.000 2.780 3.433
## cls_creditsone credit 0.516 0.121 0.217 4.257 0.000 0.277 0.754
## bty_average 0.070 0.018 0.196 3.913 0.000 0.035 0.105
## gendermale 0.172 0.055 0.155 3.114 0.002 0.063 0.280
## cls_perc_eval 0.006 0.002 0.174 3.432 0.001 0.002 0.009
## ethnicitynot minority 0.237 0.083 0.147 2.871 0.004 0.075 0.399
## -----------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.384 RMSE 0.506
## R-Squared 0.148 Coef. Var 12.163
## Adj. R-Squared 0.136 MSE 0.257
## Pred R-Squared 0.120 MAE 0.406
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 15.724 5 3.145 12.26 0.0000
## Residual 90.806 354 0.257
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.106 0.166 18.702 0.000 2.780 3.433
## cls_creditsone credit 0.516 0.121 0.217 4.257 0.000 0.277 0.754
## bty_average 0.070 0.018 0.196 3.913 0.000 0.035 0.105
## gendermale 0.172 0.055 0.155 3.114 0.002 0.063 0.280
## cls_perc_eval 0.006 0.002 0.174 3.432 0.001 0.002 0.009
## ethnicitynot minority 0.237 0.083 0.147 2.871 0.004 0.075 0.399
## -----------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 6
##
## - language added
##
## Model Summary
## --------------------------------------------------------------
## R 0.394 RMSE 0.505
## R-Squared 0.156 Coef. Var 12.124
## Adj. R-Squared 0.141 MSE 0.255
## Pred R-Squared 0.125 MAE 0.400
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 16.570 6 2.762 10.837 0.0000
## Residual 89.960 353 0.255
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.156 0.168 18.810 0.000 2.826 3.486
## cls_creditsone credit 0.500 0.121 0.210 4.129 0.000 0.262 0.738
## bty_average 0.070 0.018 0.197 3.944 0.000 0.035 0.105
## gendermale 0.176 0.055 0.159 3.190 0.002 0.067 0.284
## cls_perc_eval 0.006 0.002 0.173 3.420 0.001 0.002 0.009
## ethnicitynot minority 0.194 0.086 0.120 2.266 0.024 0.026 0.362
## languagenon-english -0.220 0.121 -0.093 -1.822 0.069 -0.458 0.017
## -------------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.394 RMSE 0.505
## R-Squared 0.156 Coef. Var 12.124
## Adj. R-Squared 0.141 MSE 0.255
## Pred R-Squared 0.125 MAE 0.400
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 16.570 6 2.762 10.837 0.0000
## Residual 89.960 353 0.255
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.156 0.168 18.810 0.000 2.826 3.486
## cls_creditsone credit 0.500 0.121 0.210 4.129 0.000 0.262 0.738
## bty_average 0.070 0.018 0.197 3.944 0.000 0.035 0.105
## gendermale 0.176 0.055 0.159 3.190 0.002 0.067 0.284
## cls_perc_eval 0.006 0.002 0.173 3.420 0.001 0.002 0.009
## ethnicitynot minority 0.194 0.086 0.120 2.266 0.024 0.026 0.362
## languagenon-english -0.220 0.121 -0.093 -1.822 0.069 -0.458 0.017
## -------------------------------------------------------------------------------------------------
##
##
##
## Stepwise Selection: Step 7
##
## - age added
##
## Model Summary
## --------------------------------------------------------------
## R 0.403 RMSE 0.504
## R-Squared 0.162 Coef. Var 12.094
## Adj. R-Squared 0.145 MSE 0.254
## Pred R-Squared 0.128 MAE 0.398
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.269 7 2.467 9.729 0.0000
## Residual 89.262 352 0.254
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.414 0.228 14.949 0.000 2.965 3.863
## cls_creditsone credit 0.491 0.121 0.207 4.066 0.000 0.254 0.729
## bty_average 0.062 0.018 0.175 3.392 0.001 0.026 0.098
## gendermale 0.201 0.057 0.182 3.529 0.000 0.089 0.313
## cls_perc_eval 0.005 0.002 0.170 3.378 0.001 0.002 0.009
## ethnicitynot minority 0.198 0.085 0.123 2.324 0.021 0.031 0.366
## languagenon-english -0.220 0.121 -0.093 -1.827 0.069 -0.458 0.017
## age -0.005 0.003 -0.088 -1.660 0.098 -0.011 0.001
## -------------------------------------------------------------------------------------------------
##
##
##
## Model Summary
## --------------------------------------------------------------
## R 0.403 RMSE 0.504
## R-Squared 0.162 Coef. Var 12.094
## Adj. R-Squared 0.145 MSE 0.254
## Pred R-Squared 0.128 MAE 0.398
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.269 7 2.467 9.729 0.0000
## Residual 89.262 352 0.254
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.414 0.228 14.949 0.000 2.965 3.863
## cls_creditsone credit 0.491 0.121 0.207 4.066 0.000 0.254 0.729
## bty_average 0.062 0.018 0.175 3.392 0.001 0.026 0.098
## gendermale 0.201 0.057 0.182 3.529 0.000 0.089 0.313
## cls_perc_eval 0.005 0.002 0.170 3.378 0.001 0.002 0.009
## ethnicitynot minority 0.198 0.085 0.123 2.324 0.021 0.031 0.366
## languagenon-english -0.220 0.121 -0.093 -1.827 0.069 -0.458 0.017
## age -0.005 0.003 -0.088 -1.660 0.098 -0.011 0.001
## -------------------------------------------------------------------------------------------------
##
##
##
## No more variables to be added/removed.
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.403 RMSE 0.504
## R-Squared 0.162 Coef. Var 12.094
## Adj. R-Squared 0.145 MSE 0.254
## Pred R-Squared 0.128 MAE 0.398
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.269 7 2.467 9.729 0.0000
## Residual 89.262 352 0.254
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.414 0.228 14.949 0.000 2.965 3.863
## cls_creditsone credit 0.491 0.121 0.207 4.066 0.000 0.254 0.729
## bty_average 0.062 0.018 0.175 3.392 0.001 0.026 0.098
## gendermale 0.201 0.057 0.182 3.529 0.000 0.089 0.313
## cls_perc_eval 0.005 0.002 0.170 3.378 0.001 0.002 0.009
## ethnicitynot minority 0.198 0.085 0.123 2.324 0.021 0.031 0.366
## languagenon-english -0.220 0.121 -0.093 -1.827 0.069 -0.458 0.017
## age -0.005 0.003 -0.088 -1.660 0.098 -0.011 0.001
## -------------------------------------------------------------------------------------------------
##
## Stepwise Selection Summary
## ------------------------------------------------------------------------------------------
## Added/ Adj.
## Step Variable Removed R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------------------------
## 1 cls_credits addition 0.040 0.038 45.3740 574.4081 0.5344
## 2 bty_average addition 0.080 0.075 30.7900 561.2173 0.5239
## 3 gender addition 0.104 0.097 22.7230 553.6736 0.5178
## 4 cls_perc_eval addition 0.128 0.118 14.8580 546.0691 0.5116
## 5 ethnicity addition 0.148 0.136 8.5540 539.7809 0.5065
## 6 language addition 0.156 0.141 7.2310 538.4109 0.5048
## 7 age addition 0.162 0.145 6.4880 537.6038 0.5036
## ------------------------------------------------------------------------------------------
our final model is
Our Model = 3.413737 + 0.198474 * [ethnicitynot minority] + 0.201301 * [gendermale] - 0.220350 * [languagenon-english] - 0.004885 * [age] + 0.005436 * [cls_perc_eval] + 0.491295 * [cls_creditsone credit] + 0.062328 * [bty_average]
Based on our final model, the following characteristics of a professor and course are associated with a high evaluation score:
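Gender (Gender of professor, collected as a binary variable at the time of the study: female, male) -> the professor should be male, because the gendermale coefficient is positive (0.201), i.e. male professors score higher than female professors with the other variables held fixed.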
cls_creditsone credit -> single credit, not multiple
ethnicity (Ethnicity of professor: not minority, minority.) -> not minority
language (Language of school where professor received education: English or non-English.)-> english
age (Age of professor) -> younger, e.g. 29 or 31 years old (the age coefficient is negative, -0.005)
bty_average (Average beauty rating of professor.) -> high (the bty_average coefficient is positive, 0.062)
https://cran.r-project.org/web/packages/olsrr/vignettes/variable_selection.html
https://www.rdocumentation.org/packages/olsrr/versions/0.5.3
PLEASE REFER TO PART C QUESTION 2.
###########################################################################
###########################################################################
### ###
### STEPWISE FORWARD REGRESSION. ###
### ###
###########################################################################
###########################################################################
# Full model used as the starting point for forward selection:
# score regressed on every other column of `train`.
model <- lm(formula = score ~ ., data = train)
##################################################################################
##################################################################################
### ###
### BUILD REGRESSION MODEL FROM A SET OF CANDIDATE PREDICTOR VARIABLES BY ###
### ENTERING PREDICTORS BASED ON P VALUES, IN A STEPWISE MANNER UNTIL THERE ###
### IS NO VARIABLE LEFT TO ENTER ANY MORE. THE MODEL SHOULD INCLUDE ALL THE ###
### CANDIDATE PREDICTOR VARIABLES. IF DETAILS IS SET TO TRUE, EACH STEP IS ###
### DISPLAYED. ###
### ###
##################################################################################
##################################################################################
# Run forward selection on the full model: candidate predictors are entered
# one at a time based on their p values. The result is auto-printed as the
# "Selection Summary" table.
ols_step_forward_p(model)
##
## Selection Summary
## ------------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------------
## 1 cls_credits 0.0405 0.0378 45.3739 574.4081 0.5344
## 2 bty_average 0.0801 0.0749 30.7897 561.2173 0.5239
## 3 gender 0.1042 0.0966 22.7228 553.6736 0.5178
## 4 cls_perc_eval 0.1278 0.1179 14.8576 546.0691 0.5116
## 5 ethnicity 0.1476 0.1356 8.5536 539.7809 0.5065
## 6 language 0.1555 0.1412 7.2315 538.4109 0.5048
## 7 age 0.1621 0.1454 6.4879 537.6038 0.5036
## 8 cls_did_eval 0.1667 0.1477 6.5770 537.6357 0.5029
## ------------------------------------------------------------------------------
# Keep the forward-selection result in `k` so it can be handed to plot().
k <- ols_step_forward_p(model)
# Plot the stored selection result.
# NOTE(review): presumably one panel per fit metric (R-Square, AIC, ...) - confirm.
plot(k)
### DETAILED OUTPUT
############################################################################
############################################################################
### ###
### DETAILED OUTPUT ###
### ###
############################################################################
############################################################################
# Forward selection with details = TRUE so that every step is printed
# (model summary, ANOVA table and parameter estimates per step).
ols_step_forward_p(model, details = TRUE)
## Forward Selection Method
## ---------------------------
##
## Candidate Terms:
##
## 1. rank
## 2. ethnicity
## 3. gender
## 4. language
## 5. age
## 6. cls_perc_eval
## 7. cls_did_eval
## 8. cls_students
## 9. cls_level
## 10. cls_profs
## 11. cls_credits
## 12. bty_average
##
## We are selecting variables based on p value...
##
##
## Forward Selection: Step 1
##
## - cls_credits
##
## Model Summary
## --------------------------------------------------------------
## R 0.201 RMSE 0.534
## R-Squared 0.040 Coef. Var 12.833
## Adj. R-Squared 0.038 MSE 0.286
## Pred R-Squared 0.032 MAE 0.435
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 4.309 1 4.309 15.093 1e-04
## Residual 102.221 358 0.286
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## ------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ------------------------------------------------------------------------------------------------
## (Intercept) 4.137 0.029 142.769 0.000 4.080 4.194
## cls_creditsone credit 0.478 0.123 0.201 3.885 0.000 0.236 0.719
## ------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 2
##
## - bty_average
##
## Model Summary
## --------------------------------------------------------------
## R 0.283 RMSE 0.524
## R-Squared 0.080 Coef. Var 12.583
## Adj. R-Squared 0.075 MSE 0.275
## Pred R-Squared 0.066 MAE 0.431
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 8.533 2 4.267 15.543 0.0000
## Residual 97.997 357 0.275
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.822 0.085 44.859 0.000 3.655 3.990
## cls_creditsone credit 0.507 0.121 0.214 4.199 0.000 0.270 0.745
## bty_average 0.071 0.018 0.200 3.923 0.000 0.035 0.106
## -----------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 3
##
## - gender
##
## Model Summary
## --------------------------------------------------------------
## R 0.323 RMSE 0.518
## R-Squared 0.104 Coef. Var 12.434
## Adj. R-Squared 0.097 MSE 0.268
## Pred R-Squared 0.085 MAE 0.422
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 11.097 3 3.699 13.798 0.0000
## Residual 95.434 356 0.268
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.692 0.094 39.180 0.000 3.506 3.877
## cls_creditsone credit 0.490 0.119 0.206 4.098 0.000 0.255 0.725
## bty_average 0.078 0.018 0.218 4.315 0.000 0.042 0.113
## gendermale 0.173 0.056 0.157 3.093 0.002 0.063 0.283
## -----------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 4
##
## - cls_perc_eval
##
## Model Summary
## --------------------------------------------------------------
## R 0.357 RMSE 0.512
## R-Squared 0.128 Coef. Var 12.287
## Adj. R-Squared 0.118 MSE 0.262
## Pred R-Squared 0.105 MAE 0.415
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 13.609 4 3.402 12.998 0.0000
## Residual 92.921 355 0.262
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.352 0.144 23.308 0.000 3.069 3.635
## cls_creditsone credit 0.438 0.119 0.184 3.672 0.000 0.203 0.672
## bty_average 0.069 0.018 0.193 3.804 0.000 0.033 0.104
## gendermale 0.187 0.055 0.169 3.372 0.001 0.078 0.296
## cls_perc_eval 0.005 0.002 0.158 3.098 0.002 0.002 0.008
## -----------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 5
##
## - ethnicity
##
## Model Summary
## --------------------------------------------------------------
## R 0.384 RMSE 0.506
## R-Squared 0.148 Coef. Var 12.163
## Adj. R-Squared 0.136 MSE 0.257
## Pred R-Squared 0.120 MAE 0.406
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 15.724 5 3.145 12.26 0.0000
## Residual 90.806 354 0.257
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -----------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -----------------------------------------------------------------------------------------------
## (Intercept) 3.106 0.166 18.702 0.000 2.780 3.433
## cls_creditsone credit 0.516 0.121 0.217 4.257 0.000 0.277 0.754
## bty_average 0.070 0.018 0.196 3.913 0.000 0.035 0.105
## gendermale 0.172 0.055 0.155 3.114 0.002 0.063 0.280
## cls_perc_eval 0.006 0.002 0.174 3.432 0.001 0.002 0.009
## ethnicitynot minority 0.237 0.083 0.147 2.871 0.004 0.075 0.399
## -----------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 6
##
## - language
##
## Model Summary
## --------------------------------------------------------------
## R 0.394 RMSE 0.505
## R-Squared 0.156 Coef. Var 12.124
## Adj. R-Squared 0.141 MSE 0.255
## Pred R-Squared 0.125 MAE 0.400
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## --------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## --------------------------------------------------------------------
## Regression 16.570 6 2.762 10.837 0.0000
## Residual 89.960 353 0.255
## Total 106.531 359
## --------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.156 0.168 18.810 0.000 2.826 3.486
## cls_creditsone credit 0.500 0.121 0.210 4.129 0.000 0.262 0.738
## bty_average 0.070 0.018 0.197 3.944 0.000 0.035 0.105
## gendermale 0.176 0.055 0.159 3.190 0.002 0.067 0.284
## cls_perc_eval 0.006 0.002 0.173 3.420 0.001 0.002 0.009
## ethnicitynot minority 0.194 0.086 0.120 2.266 0.024 0.026 0.362
## languagenon-english -0.220 0.121 -0.093 -1.822 0.069 -0.458 0.017
## -------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 7
##
## - age
##
## Model Summary
## --------------------------------------------------------------
## R 0.403 RMSE 0.504
## R-Squared 0.162 Coef. Var 12.094
## Adj. R-Squared 0.145 MSE 0.254
## Pred R-Squared 0.128 MAE 0.398
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.269 7 2.467 9.729 0.0000
## Residual 89.262 352 0.254
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.414 0.228 14.949 0.000 2.965 3.863
## cls_creditsone credit 0.491 0.121 0.207 4.066 0.000 0.254 0.729
## bty_average 0.062 0.018 0.175 3.392 0.001 0.026 0.098
## gendermale 0.201 0.057 0.182 3.529 0.000 0.089 0.313
## cls_perc_eval 0.005 0.002 0.170 3.378 0.001 0.002 0.009
## ethnicitynot minority 0.198 0.085 0.123 2.324 0.021 0.031 0.366
## languagenon-english -0.220 0.121 -0.093 -1.827 0.069 -0.458 0.017
## age -0.005 0.003 -0.088 -1.660 0.098 -0.011 0.001
## -------------------------------------------------------------------------------------------------
##
##
##
## Forward Selection: Step 8
##
## - cls_did_eval
##
## Model Summary
## --------------------------------------------------------------
## R 0.408 RMSE 0.503
## R-Squared 0.167 Coef. Var 12.078
## Adj. R-Squared 0.148 MSE 0.253
## Pred R-Squared 0.128 MAE 0.395
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.756 8 2.219 8.775 0.0000
## Residual 88.775 351 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.370 0.230 14.642 0.000 2.918 3.823
## cls_creditsone credit 0.501 0.121 0.211 4.143 0.000 0.263 0.739
## bty_average 0.058 0.019 0.163 3.101 0.002 0.021 0.094
## gendermale 0.193 0.057 0.174 3.361 0.001 0.080 0.305
## cls_perc_eval 0.006 0.002 0.187 3.609 0.000 0.003 0.009
## ethnicitynot minority 0.194 0.085 0.120 2.277 0.023 0.026 0.362
## languagenon-english -0.210 0.121 -0.088 -1.738 0.083 -0.447 0.028
## age -0.005 0.003 -0.088 -1.654 0.099 -0.011 0.001
## cls_did_eval 0.001 0.001 0.071 1.387 0.166 0.000 0.002
## -------------------------------------------------------------------------------------------------
##
##
##
## No more variables to be added.
##
## Variables Entered:
##
## + cls_credits
## + bty_average
## + gender
## + cls_perc_eval
## + ethnicity
## + language
## + age
## + cls_did_eval
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.408 RMSE 0.503
## R-Squared 0.167 Coef. Var 12.078
## Adj. R-Squared 0.148 MSE 0.253
## Pred R-Squared 0.128 MAE 0.395
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.756 8 2.219 8.775 0.0000
## Residual 88.775 351 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.370 0.230 14.642 0.000 2.918 3.823
## cls_creditsone credit 0.501 0.121 0.211 4.143 0.000 0.263 0.739
## bty_average 0.058 0.019 0.163 3.101 0.002 0.021 0.094
## gendermale 0.193 0.057 0.174 3.361 0.001 0.080 0.305
## cls_perc_eval 0.006 0.002 0.187 3.609 0.000 0.003 0.009
## ethnicitynot minority 0.194 0.085 0.120 2.277 0.023 0.026 0.362
## languagenon-english -0.210 0.121 -0.088 -1.738 0.083 -0.447 0.028
## age -0.005 0.003 -0.088 -1.654 0.099 -0.011 0.001
## cls_did_eval 0.001 0.001 0.071 1.387 0.166 0.000 0.002
## -------------------------------------------------------------------------------------------------
##
## Selection Summary
## ------------------------------------------------------------------------------
## Variable Adj.
## Step Entered R-Square R-Square C(p) AIC RMSE
## ------------------------------------------------------------------------------
## 1 cls_credits 0.0405 0.0378 45.3739 574.4081 0.5344
## 2 bty_average 0.0801 0.0749 30.7897 561.2173 0.5239
## 3 gender 0.1042 0.0966 22.7228 553.6736 0.5178
## 4 cls_perc_eval 0.1278 0.1179 14.8576 546.0691 0.5116
## 5 ethnicity 0.1476 0.1356 8.5536 539.7809 0.5065
## 6 language 0.1555 0.1412 7.2315 538.4109 0.5048
## 7 age 0.1621 0.1454 6.4879 537.6038 0.5036
## 8 cls_did_eval 0.1667 0.1477 6.5770 537.6357 0.5029
## ------------------------------------------------------------------------------
############################################################################
############################################################################
### ###
### STEPWISE BACKWARD REGRESSION. ###
### ###
############################################################################
############################################################################
# Stepwise backward regression: fit the full model (score regressed on every
# other column of `train`) as the starting point for backward elimination.
model <- lm(score ~ ., data = train)
###################################################################################
###################################################################################
### ###
### BUILD REGRESSION MODEL FROM A SET OF CANDIDATE PREDICTOR VARIABLES BY ###
### REMOVING PREDICTORS BASED ON P VALUES, IN A STEPWISE MANNER UNTIL THERE ###
### IS NO VARIABLE LEFT TO REMOVE ANY MORE. THE MODEL SHOULD INCLUDE ALL THE ###
### CANDIDATE PREDICTOR VARIABLES. IF DETAILS IS SET TO TRUE, EACH STEP IS ###
### DISPLAYED. ###
### ###
###################################################################################
###################################################################################
# Run p-value-based backward elimination on the full model. The result is not
# assigned, so the elimination summary is printed directly.
ols_step_backward_p(model)
##
##
## Elimination Summary
## -----------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------
## 1 cls_profs 0.1728 0.1442 10.0291 542.9947 0.5039
## 2 cls_level 0.1727 0.1465 8.0595 541.0263 0.5032
## 3 cls_students 0.172 0.1483 6.3349 539.3127 0.5027
## 4 rank 0.1667 0.1477 6.5770 537.6357 0.5029
## -----------------------------------------------------------------------------
# Keep the backward-elimination result in `k` so it can be plotted.
k <- ols_step_backward_p(model)
# Plot the stored elimination result.
plot(k)
############################################################################
############################################################################
### ###
### DETAILED OUTPUT ###
### ###
############################################################################
############################################################################
# Repeat the backward elimination with details = TRUE so that every
# elimination step (removed variable and refitted model) is displayed.
ols_step_backward_p(model, details = TRUE)
## Backward Elimination Method
## ---------------------------
##
## Candidate Terms:
##
## 1 . rank
## 2 . ethnicity
## 3 . gender
## 4 . language
## 5 . age
## 6 . cls_perc_eval
## 7 . cls_did_eval
## 8 . cls_students
## 9 . cls_level
## 10 . cls_profs
## 11 . cls_credits
## 12 . bty_average
##
## We are eliminating variables based on p value...
##
## - cls_profs
##
## Backward Elimination: Step 1
##
## Variable cls_profs Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.416 RMSE 0.504
## R-Squared 0.173 Coef. Var 12.103
## Adj. R-Squared 0.144 MSE 0.254
## Pred R-Squared 0.118 MAE 0.397
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 18.405 12 1.534 6.039 0.0000
## Residual 88.126 347 0.254
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.629 0.295 12.291 0.000 3.048 4.209
## ranktenure track -0.139 0.096 -0.107 -1.443 0.150 -0.328 0.050
## ranktenured -0.080 0.076 -0.073 -1.053 0.293 -0.231 0.070
## ethnicitynot minority 0.173 0.087 0.107 1.972 0.049 0.000 0.345
## gendermale 0.203 0.059 0.184 3.439 0.001 0.087 0.319
## languagenon-english -0.179 0.127 -0.076 -1.414 0.158 -0.429 0.070
## age -0.007 0.004 -0.124 -1.956 0.051 -0.014 0.000
## cls_perc_eval 0.005 0.002 0.156 2.151 0.032 0.000 0.010
## cls_did_eval 0.002 0.003 0.211 0.746 0.456 -0.004 0.009
## cls_students -0.001 0.002 -0.144 -0.480 0.632 -0.005 0.003
## cls_levelupper 0.011 0.063 0.010 0.175 0.862 -0.114 0.136
## cls_creditsone credit 0.450 0.135 0.190 3.338 0.001 0.185 0.715
## bty_average 0.055 0.019 0.156 2.947 0.003 0.018 0.092
## -------------------------------------------------------------------------------------------------
##
##
## - cls_level
##
## Backward Elimination: Step 2
##
## Variable cls_level Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.416 RMSE 0.503
## R-Squared 0.173 Coef. Var 12.086
## Adj. R-Squared 0.147 MSE 0.253
## Pred R-Squared 0.123 MAE 0.397
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 18.397 11 1.672 6.604 0.0000
## Residual 88.134 348 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.632 0.294 12.352 0.000 3.054 4.211
## ranktenure track -0.137 0.096 -0.106 -1.435 0.152 -0.325 0.051
## ranktenured -0.079 0.076 -0.072 -1.041 0.299 -0.227 0.070
## ethnicitynot minority 0.175 0.087 0.108 2.017 0.044 0.004 0.345
## gendermale 0.202 0.059 0.183 3.440 0.001 0.087 0.318
## languagenon-english -0.177 0.126 -0.074 -1.405 0.161 -0.424 0.071
## age -0.007 0.003 -0.123 -1.951 0.052 -0.014 0.000
## cls_perc_eval 0.005 0.002 0.156 2.148 0.032 0.000 0.010
## cls_did_eval 0.003 0.003 0.219 0.786 0.433 -0.004 0.009
## cls_students -0.001 0.002 -0.154 -0.526 0.599 -0.005 0.003
## cls_creditsone credit 0.445 0.131 0.187 3.400 0.001 0.187 0.702
## bty_average 0.055 0.019 0.156 2.954 0.003 0.019 0.092
## -------------------------------------------------------------------------------------------------
##
##
## - cls_students
##
## Backward Elimination: Step 3
##
## Variable cls_students Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.415 RMSE 0.503
## R-Squared 0.172 Coef. Var 12.073
## Adj. R-Squared 0.148 MSE 0.253
## Pred R-Squared 0.125 MAE 0.396
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 18.327 10 1.833 7.251 0.0000
## Residual 88.204 349 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.569 0.268 13.309 0.000 3.042 4.097
## ranktenure track -0.140 0.095 -0.108 -1.469 0.143 -0.327 0.047
## ranktenured -0.079 0.076 -0.072 -1.052 0.294 -0.228 0.069
## ethnicitynot minority 0.179 0.086 0.111 2.085 0.038 0.010 0.348
## gendermale 0.198 0.058 0.180 3.405 0.001 0.084 0.313
## languagenon-english -0.171 0.125 -0.072 -1.367 0.173 -0.417 0.075
## age -0.007 0.003 -0.123 -1.948 0.052 -0.014 0.000
## cls_perc_eval 0.006 0.002 0.183 3.527 0.000 0.003 0.009
## cls_did_eval 0.001 0.001 0.075 1.434 0.152 0.000 0.002
## cls_creditsone credit 0.443 0.131 0.187 3.396 0.001 0.187 0.700
## bty_average 0.055 0.019 0.156 2.955 0.003 0.019 0.092
## -------------------------------------------------------------------------------------------------
##
##
## - rank
##
## Backward Elimination: Step 4
##
## Variable rank Removed
##
## Model Summary
## --------------------------------------------------------------
## R 0.408 RMSE 0.503
## R-Squared 0.167 Coef. Var 12.078
## Adj. R-Squared 0.148 MSE 0.253
## Pred R-Squared 0.128 MAE 0.395
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.756 8 2.219 8.775 0.0000
## Residual 88.775 351 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.370 0.230 14.642 0.000 2.918 3.823
## ethnicitynot minority 0.194 0.085 0.120 2.277 0.023 0.026 0.362
## gendermale 0.193 0.057 0.174 3.361 0.001 0.080 0.305
## languagenon-english -0.210 0.121 -0.088 -1.738 0.083 -0.447 0.028
## age -0.005 0.003 -0.088 -1.654 0.099 -0.011 0.001
## cls_perc_eval 0.006 0.002 0.187 3.609 0.000 0.003 0.009
## cls_did_eval 0.001 0.001 0.071 1.387 0.166 0.000 0.002
## cls_creditsone credit 0.501 0.121 0.211 4.143 0.000 0.263 0.739
## bty_average 0.058 0.019 0.163 3.101 0.002 0.021 0.094
## -------------------------------------------------------------------------------------------------
##
##
##
## No more variables satisfy the condition of p value = 0.3
##
##
## Variables Removed:
##
## - cls_profs
## - cls_level
## - cls_students
## - rank
##
##
## Final Model Output
## ------------------
##
## Model Summary
## --------------------------------------------------------------
## R 0.408 RMSE 0.503
## R-Squared 0.167 Coef. Var 12.078
## Adj. R-Squared 0.148 MSE 0.253
## Pred R-Squared 0.128 MAE 0.395
## --------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 17.756 8 2.219 8.775 0.0000
## Residual 88.775 351 0.253
## Total 106.531 359
## -------------------------------------------------------------------
##
## Parameter Estimates
## -------------------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## -------------------------------------------------------------------------------------------------
## (Intercept) 3.370 0.230 14.642 0.000 2.918 3.823
## ethnicitynot minority 0.194 0.085 0.120 2.277 0.023 0.026 0.362
## gendermale 0.193 0.057 0.174 3.361 0.001 0.080 0.305
## languagenon-english -0.210 0.121 -0.088 -1.738 0.083 -0.447 0.028
## age -0.005 0.003 -0.088 -1.654 0.099 -0.011 0.001
## cls_perc_eval 0.006 0.002 0.187 3.609 0.000 0.003 0.009
## cls_did_eval 0.001 0.001 0.071 1.387 0.166 0.000 0.002
## cls_creditsone credit 0.501 0.121 0.211 4.143 0.000 0.263 0.739
## bty_average 0.058 0.019 0.163 3.101 0.002 0.021 0.094
## -------------------------------------------------------------------------------------------------
##
##
## Elimination Summary
## -----------------------------------------------------------------------------
## Variable Adj.
## Step Removed R-Square R-Square C(p) AIC RMSE
## -----------------------------------------------------------------------------
## 1 cls_profs 0.1728 0.1442 10.0291 542.9947 0.5039
## 2 cls_level 0.1727 0.1465 8.0595 541.0263 0.5032
## 3 cls_students 0.172 0.1483 6.3349 539.3127 0.5027
## 4 rank 0.1667 0.1477 6.5770 537.6357 0.5029
## -----------------------------------------------------------------------------
BEST SUBSET REGRESSION
STEPWISE BACKWARD REGRESSION
STEPWISE FORWARD REGRESSION
A lower AIC indicates a better model. Should you use forward or backward stepwise selection?
Only consider models whose number of variables is less than the sample size (for linear regression).
Source: https://www.jmlr.org/papers/volume20/17-334/17-334.pdf
Unless the number of candidate variables > sample size (or number of events), use a backward stepwise approach.
https://bookdown.org/yihui/rmarkdown-cookbook/update-date.html
https://www.rdocumentation.org/packages/ggplot2/versions/1.0.1/topics/geom_bar
“The best way to predict the future is to create it.” — Abraham Lincoln.
End of Document.