# Instrumental Variables example
# install.packages("sem")
library(sem)
## Warning: package 'sem' was built under R version 4.3.3
# Load the wage data set used by the wage examples in this script.
wages <- read.csv("http://inta.gatech.s3.amazonaws.com/wage2.csv")

# Instrumental variables (2SLS) regression of log wage on education.
# Father's education (feduc) is the instrument: it appears only in the
# instrument formula, not in the model formula. Age, marital status, and
# race are assumed to be less related to underlying ability, so they are
# kept as controls in both formulas (i.e. in the "first stage" too).
iv.results <- tsls(
  lwage ~ educ + age + married + black,
  ~ feduc + age + married + black,
  data = wages
)

# Analogous OLS regression, which does not use father's education at all.
ols.results <- lm(lwage ~ educ + age + married + black, data = wages)
print(summary(ols.results))
##
## Call:
## lm(formula = lwage ~ educ + age + married + black, data = wages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.91980 -0.24114 0.01988 0.25465 1.31126
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.231197 0.159806 32.735 < 2e-16 ***
## educ 0.056040 0.005820 9.629 < 2e-16 ***
## age 0.019539 0.004062 4.810 1.76e-06 ***
## married 0.194424 0.040952 4.748 2.38e-06 ***
## black -0.209969 0.038208 -5.495 5.03e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3834 on 930 degrees of freedom
## Multiple R-squared: 0.1747, Adjusted R-squared: 0.1711
## F-statistic: 49.21 on 4 and 930 DF, p-value: < 2.2e-16
print(summary(iv.results))
##
## 2SLS Estimates
##
## Model Formula: lwage ~ educ + age + married + black
##
## Instruments: ~feduc + age + married + black
##
## Residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.85005 -0.24061 0.02436 0.00000 0.26225 1.34065
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.62589626 0.26729720 17.30619 < 2.22e-16 ***
## educ 0.09640575 0.01586492 6.07666 1.9681e-09 ***
## age 0.02106442 0.00472040 4.46242 9.3723e-06 ***
## married 0.20134756 0.04651945 4.32824 1.7109e-05 ***
## black -0.11997207 0.05366054 -2.23576 0.025667 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3944321 on 736 degrees of freedom
# Note that the point estimate for education has gone up, from about 0.056
# under OLS to about 0.096 under IV. This suggests that for people whose
# education varies with their father's education, holding age, marital
# status, and race fixed, each additional year of education is associated
# with roughly 10% higher wages.
# Also note that the standard error on education is higher for the IV than
# for the OLS results. The reason is that IV uses only some of the variation
# in education: only the part related to father's education, so it is as if
# there were less variation overall. The IV fit also uses fewer observations
# (736 vs. 930 residual degrees of freedom), since feduc is missing for some
# rows.
# Stepwise regression and tree example
# install.packages(c('MASS','sem'))
library(MASS)
# Initial ("full") model: the largest set of coefficients we would ever want
# to use. The stepwise search further down starts from this fit.
start.model <- lm(
  wage ~ hours + IQ + KWW + educ + exper + tenure + age + married + black +
    south + urban + sibs,
  data = wages
)
summary(start.model)
##
## Call:
## lm(formula = wage ~ hours + IQ + KWW + educ + exper + tenure +
## age + married + black + south + urban + sibs, data = wages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -849.76 -223.19 -40.18 172.09 2091.17
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -562.589 185.142 -3.039 0.00244 **
## hours -3.485 1.621 -2.150 0.03182 *
## IQ 2.828 1.004 2.817 0.00495 **
## KWW 5.261 1.981 2.656 0.00804 **
## educ 48.213 7.182 6.713 3.33e-11 ***
## exper 9.774 3.589 2.723 0.00658 **
## tenure 5.242 2.406 2.178 0.02963 *
## age 5.046 4.930 1.024 0.30634
## married 170.231 37.883 4.494 7.89e-06 ***
## black -108.844 39.808 -2.734 0.00637 **
## south -53.689 25.536 -2.102 0.03578 *
## urban 160.652 26.200 6.132 1.29e-09 ***
## sibs -1.068 5.455 -0.196 0.84488
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 352.6 on 922 degrees of freedom
## Multiple R-squared: 0.2493, Adjusted R-squared: 0.2395
## F-statistic: 25.51 on 12 and 922 DF, p-value: < 2.2e-16
stepwise.model<- step(start.model)
## Start: AIC=10981.26
## wage ~ hours + IQ + KWW + educ + exper + tenure + age + married +
## black + south + urban + sibs
##
## Df Sum of Sq RSS AIC
## - sibs 1 4763 114655843 10979
## - age 1 130265 114781346 10980
## <none> 114651080 10981
## - south 1 549683 115200763 10984
## - hours 1 574799 115225880 10984
## - tenure 1 590101 115241181 10984
## - KWW 1 877455 115528536 10986
## - exper 1 922291 115573372 10987
## - black 1 929659 115580739 10987
## - IQ 1 986880 115637960 10987
## - married 1 2510929 117162009 11000
## - urban 1 4675218 119326298 11017
## - educ 1 5603726 120254807 11024
##
## Step: AIC=10979.3
## wage ~ hours + IQ + KWW + educ + exper + tenure + age + married +
## black + south + urban
##
## Df Sum of Sq RSS AIC
## - age 1 128704 114784547 10978
## <none> 114655843 10979
## - south 1 546953 115202796 10982
## - hours 1 575305 115231148 10982
## - tenure 1 590732 115246575 10982
## - KWW 1 911607 115567450 10985
## - exper 1 926789 115582632 10985
## - black 1 1000335 115656178 10985
## - IQ 1 1002450 115658294 10985
## - married 1 2508266 117164110 10998
## - urban 1 4686063 119341906 11015
## - educ 1 5673927 120329770 11022
##
## Step: AIC=10978.35
## wage ~ hours + IQ + KWW + educ + exper + tenure + married + black +
## south + urban
##
## Df Sum of Sq RSS AIC
## <none> 114784547 10978
## - hours 1 562954 115347501 10981
## - south 1 565737 115350283 10981
## - tenure 1 680663 115465209 10982
## - IQ 1 907667 115692213 10984
## - black 1 980306 115764853 10984
## - KWW 1 1413125 116197671 10988
## - exper 1 1678662 116463208 10990
## - married 1 2538972 117323519 10997
## - urban 1 4639497 119424043 11013
## - educ 1 6097302 120881849 11025
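# The command "step" adds and/or drops terms one at a time to minimize an
# information criterion, by default AIC (lower is better). Called as above,
# with no "scope", it only searches backward, i.e. it only drops terms.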
summary(stepwise.model)
##
## Call:
## lm(formula = wage ~ hours + IQ + KWW + educ + exper + tenure +
## married + black + south + urban, data = wages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -872.52 -224.26 -41.71 171.08 2086.14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -453.2969 141.0390 -3.214 0.001354 **
## hours -3.4474 1.6194 -2.129 0.033536 *
## IQ 2.6621 0.9848 2.703 0.006996 **
## KWW 6.0918 1.8062 3.373 0.000775 ***
## educ 49.4804 7.0627 7.006 4.73e-12 ***
## exper 11.5519 3.1425 3.676 0.000251 ***
## tenure 5.5777 2.3828 2.341 0.019455 *
## married 171.1045 37.8476 4.521 6.96e-06 ***
## black -109.2993 38.9083 -2.809 0.005072 **
## south -54.4073 25.4951 -2.134 0.033103 *
## urban 159.8942 26.1639 6.111 1.46e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 352.5 on 924 degrees of freedom
## Multiple R-squared: 0.2484, Adjusted R-squared: 0.2402
## F-statistic: 30.53 on 10 and 924 DF, p-value: < 2.2e-16
# install.packages('tree')
library(tree)
# Fit a regression tree for wage.
# NOTE(review): method = "anova" looks like an rpart-style option; confirm
# that tree() actually does anything with it.
wage.tree <- tree(
  wage ~ married + hours + IQ + KWW + educ + tenure + exper,
  method = "anova",
  data = wages
)
summary(wage.tree)
##
## Regression tree:
## tree(formula = wage ~ married + hours + IQ + KWW + educ + tenure +
## exper, data = wages, method = "anova")
## Variables actually used in tree construction:
## [1] "KWW" "IQ" "educ"
## Number of terminal nodes: 6
## Residual mean deviance: 130000 = 120800000 / 929
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -891.70 -250.10 -27.35 0.00 188.60 2137.00
# Print the results
# Draw the fitted tree, then add the text labels for its decisions.
plot(wage.tree)
text(wage.tree)

# Cross-validate the tree (cv.tree() defaults to 10 folds) and show the
# result, which lists the deviance for each candidate tree size.
cross.validation <- cv.tree(wage.tree)
cross.validation
## $size
## [1] 6 5 4 3 2 1
##
## $dev
## [1] 129775034 130486576 130848479 133364868 140427226 152962978
##
## $k
## [1] -Inf 2430505 2880143 3466767 7749956 15376573
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
# look for the size with the lowest "dev" or deviance
# Why might this be different than the fitted tree?
# Prune the tree down to 5 terminal nodes, purely to demonstrate the call;
# here the pruned tree turns out to be a slightly worse fit.
pruned.wage.tree <- prune.tree(wage.tree, best = 5)
summary(pruned.wage.tree)
##
## Regression tree:
## snip.tree(tree = wage.tree, nodes = 14L)
## Variables actually used in tree construction:
## [1] "KWW" "IQ" "educ"
## Number of terminal nodes: 5
## Residual mean deviance: 132500 = 123200000 / 930
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -891.70 -250.80 -26.53 0.00 190.40 2137.00
# Linear regression on the same variables that were offered to the tree,
# followed by its analysis-of-variance table.
lmfit <- lm(
  wage ~ married + hours + IQ + KWW + educ + tenure + exper,
  data = wages
)
anova(lmfit)
## Analysis of Variance Table
##
## Response: wage
## Df Sum Sq Mean Sq F value Pr(>F)
## married 1 2848893 2848893 21.7647 3.535e-06 ***
## hours 1 29759 29759 0.2273 0.6336108
## IQ 1 14964264 14964264 114.3225 < 2.2e-16 ***
## KWW 1 6619303 6619303 50.5695 2.297e-12 ***
## educ 1 4117977 4117977 31.4601 2.687e-08 ***
## tenure 1 1277563 1277563 9.7602 0.0018387 **
## exper 1 1518568 1518568 11.6014 0.0006873 ***
## Residuals 927 121339841 130895
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
summary(wage.tree)
##
## Regression tree:
## tree(formula = wage ~ married + hours + IQ + KWW + educ + tenure +
## exper, data = wages, method = "anova")
## Variables actually used in tree construction:
## [1] "KWW" "IQ" "educ"
## Number of terminal nodes: 6
## Residual mean deviance: 130000 = 120800000 / 929
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -891.70 -250.10 -27.35 0.00 188.60 2137.00
# Notice that the residual mean square was 130895 using the linear model,
# compared to the residual mean deviance of 130000 using the tree. So the
# tree has slightly less residual error on the data it was fitted to, which
# suggests (but, being in-sample, does not prove) it is the better predictor.
# This is especially striking given that the tree only uses KWW, educ, and IQ,
# while the regression needs substantial contributions from tenure, experience,
# and marital status too.
# One hypothetical individual, scored by both fitted models. Building the
# data frame once guarantees both predictions use exactly the same inputs.
example.person <- data.frame(
  IQ = 100, KWW = 50, educ = 16, married = 1,
  hours = 40, tenure = 3, exper = 2
)
predict(wage.tree, newdata = example.person)
## 1
## 1468.696
predict(lmfit, newdata = example.person)
## 1
## 1089.293
# These predictors give different estimates for the expected wage of a
# particular individual, relatively young, married, well educated, with good
# knowledge of the world of work. Which would you trust and why?
# Extension: try to build another tree, and see what it looks like.
# You can sample the data with the command:
# install.packages('dplyr')
# library('dplyr')
# wages.sample <- slice_sample(wages, n = 500)
# Takehome exercise from last class
# Training data for the take-home exercise.
lfp <- read.csv("http://inta.gatech.s3.amazonaws.com/mroz_train.csv")

# Store the outcome as a factor. Both model formulas below turn it back into
# a numeric response with `as.numeric(inlf) - 1`.
lfp[["inlf"]] <- as.factor(lfp[["inlf"]])

# Tree model; rows with missing values are omitted via na.action.
inlf.tree <- tree(
  as.numeric(inlf) - 1 ~ huseduc + husage + kidslt6 + kidsge6 + nwifeinc +
    educ + age,
  data = lfp,
  na.action = na.omit
)

# Linear model with the same response and predictors.
inlf.lm <- lm(
  as.numeric(inlf) - 1 ~ huseduc + husage + kidslt6 + kidsge6 + nwifeinc +
    educ + age,
  data = lfp
)

print(inlf.tree)
## node), split, n, deviance, yval
## * denotes terminal node
##
## 1) root 602 148.2000 0.56150
## 2) kidslt6 < 0.5 478 113.8000 0.60880
## 4) husage < 48.5 273 55.2800 0.71790
## 8) nwifeinc < 27.936 224 40.4600 0.76340
## 16) educ < 9.5 19 4.7370 0.47370 *
## 17) educ > 9.5 205 33.9800 0.79020
## 34) kidsge6 < 2.5 153 20.2400 0.84310 *
## 35) kidsge6 > 2.5 52 12.0600 0.63460 *
## 9) nwifeinc > 27.936 49 12.2400 0.51020
## 18) huseduc < 12.5 11 0.9091 0.09091 *
## 19) huseduc > 12.5 38 8.8420 0.63160
## 38) nwifeinc < 31.85 11 2.1820 0.27270 *
## 39) nwifeinc > 31.85 27 4.6670 0.77780 *
## 5) husage > 48.5 205 50.9800 0.46340
## 10) educ < 12.5 155 37.2000 0.40000 *
## 11) educ > 12.5 50 11.2200 0.66000 *
## 3) kidslt6 > 0.5 124 29.1900 0.37900 *
# compute predictions manually, and save them as lfp$predicted.inlf
lfp$predicted.inlf<-predict(inlf.tree)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Print ROC-based diagnostics for a set of predicted scores.
#
# Arguments:
#   y_true       Observed outcomes, passed to roc() as its first argument.
#   y_predicted  Predicted scores, passed to roc() as its second argument.
#   detail       If TRUE, also print ci.auc(), ci.se() and ci.sp() of the
#                curve (each under its own heading) before the point estimate.
#
# Always prints a heading followed by auc() of the curve; the value of that
# final print() call is the function's result.
evaluate <- function(y_true, y_predicted, detail = FALSE) {
  roc_curve <- roc(y_true, y_predicted)

  if (detail) {
    print(ci.auc(roc_curve))
    print("Sensitivity (True Positive Rate)")
    print(ci.se(roc_curve))
    print("Specificity (True Negative Rate)")
    print(ci.sp(roc_curve))
  }

  cat("Point estimate (final word on effectiveness)\n")
  print(auc(roc_curve))
}
evaluate(lfp$inlf, lfp$predicted.inlf)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Point estimate (final word on effectiveness)
## Area under the curve: 0.7283
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.3.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
# Random forest on the same exercise. The training data are re-read (so inlf
# is no longer the factor created earlier) and missing values are replaced
# by 0 before fitting.
lfp <- read.csv("http://inta.gatech.s3.amazonaws.com/mroz_train.csv")
lfp[is.na(lfp)] <- 0

# NOTE(review): an AUC of exactly 1 both in and out of sample is suspicious.
# `hours` presumably records hours worked, which would reveal `inlf` directly
# (target leakage) -- confirm, and consider dropping it from the formula.
rf <- randomForest(as.factor(inlf) ~ hours + kidslt6, data = lfp)

# Score each row with the predicted probability of the positive class
# (column "1"), selected by name rather than by position, so that higher
# scores mean "more likely inlf = 1". Without `newdata`, predict() on a
# randomForest returns the out-of-bag predictions.
evaluate(as.factor(lfp$inlf), predict(rf, type = "prob")[, "1"])
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Point estimate (final word on effectiveness)
## Area under the curve: 1

# Same evaluation on the held-out test file, with the same NA handling.
lfp.out <- read.csv("http://inta.gatech.s3.amazonaws.com/mroz_test.csv")
lfp.out[is.na(lfp.out)] <- 0
evaluate(
  as.factor(lfp.out$inlf),
  predict(rf, newdata = lfp.out, type = "prob")[, "1"]
)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Point estimate (final word on effectiveness)
## Area under the curve: 1
evaluate(lfp.out$inlf, predict(inlf.lm, newdata=lfp.out))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Point estimate (final word on effectiveness)
## Area under the curve: 0.7322
evaluate(lfp.out$inlf, predict(inlf.tree, newdata=lfp.out))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Point estimate (final word on effectiveness)
## Area under the curve: 0.6488
###################################################################################
# CHANGE DESCRIPTION
############################### CODE CHANGE STARTS ################################
summary(wages)
## wage hours IQ KWW
## Min. : 115.0 Min. :20.00 Min. : 50.0 Min. :12.00
## 1st Qu.: 669.0 1st Qu.:40.00 1st Qu.: 92.0 1st Qu.:31.00
## Median : 905.0 Median :40.00 Median :102.0 Median :37.00
## Mean : 957.9 Mean :43.93 Mean :101.3 Mean :35.74
## 3rd Qu.:1160.0 3rd Qu.:48.00 3rd Qu.:112.0 3rd Qu.:41.00
## Max. :3078.0 Max. :80.00 Max. :145.0 Max. :56.00
##
## educ exper tenure age
## Min. : 9.00 Min. : 1.00 Min. : 0.000 Min. :28.00
## 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.: 3.000 1st Qu.:30.00
## Median :12.00 Median :11.00 Median : 7.000 Median :33.00
## Mean :13.47 Mean :11.56 Mean : 7.234 Mean :33.08
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.:36.00
## Max. :18.00 Max. :23.00 Max. :22.000 Max. :38.00
##
## married black south urban
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.893 Mean :0.1283 Mean :0.3412 Mean :0.7176
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## sibs brthord meduc feduc
## Min. : 0.000 Min. : 1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 8.00 1st Qu.: 8.00
## Median : 2.000 Median : 2.000 Median :12.00 Median :10.00
## Mean : 2.941 Mean : 2.277 Mean :10.68 Mean :10.22
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.:12.00 3rd Qu.:12.00
## Max. :14.000 Max. :10.000 Max. :18.00 Max. :18.00
## NA's :83 NA's :78 NA's :194
## lwage
## Min. :4.745
## 1st Qu.:6.506
## Median :6.808
## Mean :6.779
## 3rd Qu.:7.056
## Max. :8.032
##
# Tree model for hours on KWW, education, tenure, age, marital status and
# experience.
wage2.tree <- tree(
  hours ~ KWW + educ + tenure + age + married + exper,
  method = "anova",
  data = wages
)

# Draw the tree and add its text labels.
plot(wage2.tree)
text(wage2.tree)

# Predicted hours for one hypothetical employee.
# NOTE(review): educ = 19 and age = 24 lie outside the ranges reported by
# summary(wages) (educ 9-18, age 28-38), so this prediction extrapolates.
hypothetical.employee <- data.frame(
  KWW = 37, educ = 19, tenure = 7, age = 24, married = 0, exper = 4
)
predict(wage2.tree, newdata = hypothetical.employee)
## 1
## 47.4386
############################### CODE CHANGE ENDS ##################################
# In this code change, I was aiming to see how many hours an employee would be
# predicted to work given their Knowledge of the World of Work (KWW) score,
# education, tenure, age, marital status, and experience. A decision tree is
# plotted showing the different outcomes. I then wanted to predict the hours
# for an employee with KWW=37.00 (median value in summary), educ=19 (K-12 +
# 4 years bachelor's + 2 years master's), tenure=7.00 (median value in
# summary), age=24, not married, and 4 years of experience. The output is
# 47.4386, which matches the value shown in the plotted decision tree
# (47.44 after rounding; 47.43 if truncated).
# END OF CHANGE DESCRIPTION
####################################################################################
####################################################################################
# CHANGE DESCRIPTION
################################## CODE CHANGE STARTS ##############################
summary(wages)
## wage hours IQ KWW
## Min. : 115.0 Min. :20.00 Min. : 50.0 Min. :12.00
## 1st Qu.: 669.0 1st Qu.:40.00 1st Qu.: 92.0 1st Qu.:31.00
## Median : 905.0 Median :40.00 Median :102.0 Median :37.00
## Mean : 957.9 Mean :43.93 Mean :101.3 Mean :35.74
## 3rd Qu.:1160.0 3rd Qu.:48.00 3rd Qu.:112.0 3rd Qu.:41.00
## Max. :3078.0 Max. :80.00 Max. :145.0 Max. :56.00
##
## educ exper tenure age
## Min. : 9.00 Min. : 1.00 Min. : 0.000 Min. :28.00
## 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.: 3.000 1st Qu.:30.00
## Median :12.00 Median :11.00 Median : 7.000 Median :33.00
## Mean :13.47 Mean :11.56 Mean : 7.234 Mean :33.08
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.:36.00
## Max. :18.00 Max. :23.00 Max. :22.000 Max. :38.00
##
## married black south urban
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.893 Mean :0.1283 Mean :0.3412 Mean :0.7176
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## sibs brthord meduc feduc
## Min. : 0.000 Min. : 1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 8.00 1st Qu.: 8.00
## Median : 2.000 Median : 2.000 Median :12.00 Median :10.00
## Mean : 2.941 Mean : 2.277 Mean :10.68 Mean :10.22
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.:12.00 3rd Qu.:12.00
## Max. :14.000 Max. :10.000 Max. :18.00 Max. :18.00
## NA's :83 NA's :78 NA's :194
## lwage
## Min. :4.745
## 1st Qu.:6.506
## Median :6.808
## Mean :6.779
## 3rd Qu.:7.056
## Max. :8.032
##
# Linear model for hours on the same predictors as wage2.tree, followed by
# its analysis-of-variance table.
lmfit2 <- lm(
  hours ~ KWW + educ + tenure + age + married + exper,
  data = wages
)
anova(lmfit2)
## Analysis of Variance Table
##
## Response: hours
## Df Sum Sq Mean Sq F value Pr(>F)
## KWW 1 632 632.27 12.3139 0.0004712 ***
## educ 1 126 125.72 2.4486 0.1179714
## tenure 1 223 222.94 4.3419 0.0374579 *
## age 1 0 0.45 0.0088 0.9252333
## married 1 47 47.18 0.9188 0.3380411
## exper 1 68 67.88 1.3219 0.2505429
## Residuals 928 47649 51.35
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Predicted hours from the linear model for one hypothetical employee.
hypothetical.employee <- data.frame(
  KWW = 37, educ = 19, tenure = 7, age = 24, married = 0, exper = 4
)
predict(lmfit2, newdata = hypothetical.employee)
## 1
## 44.01569
################################## CODE CHANGE ENDS ################################
# For this change, I wanted to fit a linear regression to see how many hours
# an employee would be predicted to work given KWW (37.00 - median value in
# summary), education (19 years - K-12 + 4 years bachelor's + 2 years
# master's), tenure (7.00 - median value in summary), age (24 years old),
# married (no), and experience (4 years).
# The output indicates the employee is predicted to work about 44 hours.
# Comparing the decision tree and the linear regression, I would trust the
# decision tree more, since a tree can capture nonlinear relationships and
# interactions that a linear regression model misses. Note, however, that
# neither model was validated on held-out data here.
# The decision tree accounts for the variables of education,
# KWW, and tenure and produces an output based on the branches of the tree.
# The output for the decision tree prediction is 47.4386 hours, while
# the linear regression model output is 44.01569 hours.
# END OF CHANGE DESCRIPTION
####################################################################################