library(tree)
library(DT)
library(kableExtra)
library(rpart)
library(faraway)
library(maptree)
library(tidyverse)
library(gridExtra)
The dataset used in this analysis is the uswages dataset, which has 2,000 rows and 10 columns. It contains weekly wages of US male workers sampled from the Current Population Survey conducted in 1988. All predictors in the dataset were used in this analysis, with the aim of investigating how well they predict an individual's weekly wage.
data("uswages")#Loading the entire uswages dataset#
datatable(uswages,rownames=F,filter="top")
dplyr::glimpse(uswages) #Overview of the structure of the dataset
## Rows: 2,000
## Columns: 10
## $ wage <dbl> 771.60, 617.28, 957.83, 617.28, 902.18, 299.15, 541.31, 148.39, ~
## $ educ <int> 18, 15, 16, 12, 14, 12, 16, 16, 12, 12, 9, 14, 17, 14, 14, 10, 1~
## $ exper <int> 18, 20, 9, 24, 12, 33, 42, 0, 36, 37, 20, 29, 16, 21, 11, 10, 8,~
## $ race <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ smsa <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1~
## $ ne <int> 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0~
## $ mw <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0~
## $ so <int> 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1~
## $ we <int> 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0~
## $ pt <int> 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0~
The scatterplot matrix in Figure 2.1 shows that the correlations between weekly wages and the predictors in the dataset are weak, both positive and negative. There is a weak positive correlation between weekly wages and years of education (0.248) and between weekly wages and years of experience (0.183). However, years of education and years of experience are negatively correlated (-0.302).
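The quoted correlations can also be computed directly from the data (a quick check using base R's cor()):
#Pairwise correlations among weekly wages, years of education and years of experience
cor(uswages[, c("wage", "educ", "exper")])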
if (require(GGally)){ #Scatterplot matrix on the predictors in the dataset
uswages %>%
ggpairs() + ggtitle("Weekly Wages of US Male Workers in 1988") + theme(plot.title = element_text(hjust = 0.5))
}
Figure 2.1: Scatterplot Matrix
Figure 2.2 supports the observations from Figure 2.1. Weekly wages increase with the number of years of education. A similar but much weaker pattern holds for years of experience, which does not appear to have a substantial impact on an individual's weekly wage.
if(require(ggplot2)){ #Scatterplots of weekly wages against each predictor in the dataset
  wage_scatter <- function(var, label){
    ggplot(uswages) + geom_point(aes(x = .data[[var]], y = wage), color = "brown") +
      theme_bw() + xlab(label) + ylab("Weekly Wages (Dollars)") +
      theme(panel.grid = element_blank())
  }
  grid.arrange(wage_scatter("educ",  "Years of Education"),
               wage_scatter("exper", "Years of Experience"),
               wage_scatter("race",  "Race"),
               wage_scatter("smsa",  "Standard Metropolitan Area"),
               wage_scatter("ne",    "North East"),
               wage_scatter("we",    "West"),
               wage_scatter("so",    "South"),
               wage_scatter("pt",    "Part Time/Not"),
               wage_scatter("mw",    "Midwest"),
               nrow = 3, ncol = 3)
}
Figure 2.2: Scatterplot of Weekly Wages vs All Predictors
Figure 2.3 plots weekly wages against years of education, with years of experience mapped to color. A clear positive association exists between weekly wages and years of education: wages tend to rise as education increases. Years of experience, however, does not appear to have a strong effect on weekly wages. Most individuals with the lowest weekly wages have the fewest years of education but the most years of experience, whereas individuals with many years of education earn high weekly wages even with relatively few years of experience.
if(require(ggplot2)){ #Scatterplot of both years of education and years of experience on weekly wages
ggplot(data=sample_n(uswages,size=2000),
aes(x=educ,y=wage,color=exper))+geom_point()+
geom_smooth()+theme_bw()+xlab("Education")+ylab("Wage")+theme(panel.grid = element_blank())
}
Figure 2.3: Scatterplot of Weekly Wages on Education and Experience
Figure 2.4 shows that the distribution of the target variable, weekly wages, is asymmetrical: it is right skewed. The distribution of years of education also appears asymmetrical, whereas that of years of experience appears roughly symmetrical. Note, however, that the variable exper includes negative values.
plotHist <- function(columns, bin, colours){ #Histograms to visualize the distribution of each variable in the dataset
  par(mfrow = c(4, 3))
  for (i in columns) {
    hist(uswages[, i], main = paste("Histogram of", names(uswages)[i]),
         nclass = bin, las = 1, col = colours,
         xlab = names(uswages)[i])
  }
}
plotHist(1:10, 5, "brown")
Figure 2.4: Histogram of the Predictors
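The negative experience values noted above can be checked directly, for example:
#Range of the experience variable and the number of negative values it contains
summary(uswages$exper)
sum(uswages$exper < 0)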
Six variables (educ, exper, smsa, race, pt, and mw) from the stepwise selection were significant at the 0.05 level.
#Selecting significant variables in the dataset with stepwise selection
Full_model=lm(wage ~ educ + exper + race + smsa + ne + mw + so + we + pt,data=uswages)
Stepwise_model=step(Full_model,direction="both",test="F")
## Start: AIC=24093.99
## wage ~ educ + exper + race + smsa + ne + mw + so + we + pt
##
##
## Step: AIC=24093.99
## wage ~ educ + exper + race + smsa + ne + mw + so + pt
##
## Df Sum of Sq RSS AIC F value Pr(>F)
## <none> 338114663 24094
## - so 1 621152 338735814 24096 3.6577 0.0559549 .
## - ne 1 631096 338745759 24096 3.7162 0.0540279 .
## - mw 1 822806 338937469 24097 4.8451 0.0278388 *
## - race 1 1946924 340061587 24104 11.4645 0.0007232 ***
## - smsa 1 4808776 342923438 24120 28.3166 1.146e-07 ***
## - pt 1 18819536 356934199 24200 110.8195 < 2.2e-16 ***
## - exper 1 26872523 364987185 24245 158.2398 < 2.2e-16 ***
## - educ 1 38320052 376434714 24307 225.6490 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The variables educ, exper, race, smsa, pt and mw appeared significant at the 0.05 significance level
The variable mw was not significant at α = 0.05 (p-value = 0.279) when a linear regression model was fitted with all six variables.
Select_model=lm(wage~educ + exper + smsa + race + mw + pt, data=uswages)
summary(Select_model) #Fitting the model again with the significant variables. The variable mw appeared insignificant at 0.05 significant level
##
## Call:
## lm(formula = wage ~ educ + exper + smsa + race + mw + pt, data = uswages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -891.3 -213.5 -54.6 127.5 7493.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -237.6379 51.1609 -4.645 3.62e-06 ***
## educ 48.7479 3.2486 15.006 < 2e-16 ***
## exper 9.0684 0.7259 12.493 < 2e-16 ***
## smsa 114.2617 21.6089 5.288 1.37e-07 ***
## race -126.8249 34.6432 -3.661 0.000258 ***
## mw -23.1911 21.4277 -1.082 0.279253
## pt -335.7629 31.9594 -10.506 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 412.4 on 1993 degrees of freedom
## Multiple R-squared: 0.1982, Adjusted R-squared: 0.1957
## F-statistic: 82.09 on 6 and 1993 DF, p-value: < 2.2e-16
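Since mw is not significant, it can be dropped; refitting on the full data with the remaining five predictors gives a quick check (the train/test modelling below uses these same five predictors):
#Refitting without mw on the full dataset (quick check only)
summary(lm(wage ~ educ + exper + smsa + race + pt, data = uswages))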
Data Splitting: The data set was split randomly into 70% (1,400 observations) for the train data and 30% (600 observations) for the test data.
set.seed(702)
Subset_dat=uswages[,c("wage","educ","exper","pt","smsa","race")]
n=dim(Subset_dat)[1] #Building an OLS linear regression model with the 5 significant variables and randomly
#splitting the data into 70% train data and 30% test data.
Raw_data=sample(1:n,size=n*0.7,replace = F)
train_dat=uswages[Raw_data,]
test_dat=uswages[-Raw_data,]
An ordinary least squares (OLS) linear regression model was built on the train data and its performance evaluated on the test data. The root mean square error (RMSE) associated with the model was 452.7986, indicating that the model's test predictions are, on average, within about $453 of the true weekly wages of individuals in the data set.
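For reference, the RMSE used for model comparison throughout can be wrapped in a small helper function (a sketch; the chunks below compute it inline):
#Root mean square error between observed and predicted values
rmse <- function(observed, predicted) sqrt(mean((observed - predicted)^2))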
#Fitting the linear regression on the training data with only the five significant variables
lm_model<-lm(wage~educ + exper + smsa + race + pt, data=train_dat)
lm_pred<- predict(lm_model, test_dat) #Evaluating the model's performance on the test data
print('Root Mean Square Error(RMSE) using linear model:')
## [1] "Root Mean Square Error(RMSE) using linear model:"
lm_RMSE=sqrt(mean((lm_pred - test_dat[,"wage"])^2))
lm_RMSE #The test RMSE associated with the OLS linear regression model is 452.7986
## [1] 452.7986
New_lm=data.frame(lm_pred,test_dat[,"wage"])
ggplot(New_lm)+geom_point(aes(x=lm_pred,y=test_dat[,"wage"]),color="brown")+theme_bw()+xlab("Predicted Values")+ylab("Wage (Observed Values)")+
  theme(panel.grid = element_blank())#A plot of the predicted values from the linear regression model Vs the observed values in the test data
Figure 4.1: A plot of the observed Vs OLS predicted weekly wages
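A 45-degree reference line (predicted = observed) can make this kind of plot easier to read; one possible addition (not part of the original figure) is:
#Overlay a y = x reference line on the most recent ggplot
last_plot() + geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "grey40")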
Export=data.frame(Observed_Wages=test_dat[,"wage"],Predicted_Wages=lm_pred)
datatable(Export,filter = "top",rownames = F) #Scoring dataset for the OLS linear regression model on the test data
The full data set was split randomly into 70% (1,400 observations) for the train data and 30% (600 observations) for the test data.
set.seed(702)
n=dim(uswages)[1]
Tree_data=sample(1:n,size=n*0.7,replace = F)#Randomly splitting the full uswages dataset into 70% train data and 30% test data
Tree_train_dat=uswages[Tree_data,]
Tree_test_dat=uswages[-Tree_data,]
Two different tree models were built on the train data. The first (Figure 5.1) was built with a minbucket value of 50 and a cost-complexity parameter (cp) of 0.01.
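rpart forwards minbucket and cp to rpart.control; an equivalent, more explicit specification (a sketch only, identical in effect to the chunk below) would be:
#Explicit control object: at least 50 observations per terminal node and a cost-complexity threshold of 0.01
ctrl <- rpart.control(minbucket = 50, cp = 0.01)
rpart(wage ~ educ + exper + race + smsa + ne + mw + so + we + pt,
      data = Tree_train_dat, control = ctrl)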
Tree_Model1=rpart(wage~educ + exper + race + smsa + ne + mw + so + we + pt, Tree_train_dat, minbucket=50, cp=.01)
summary(Tree_Model1) #Building first tree model on the train data with cost complexity parameter of 0.01 and minbucket of 50
## Call:
## rpart(formula = wage ~ educ + exper + race + smsa + ne + mw +
## so + we + pt, data = Tree_train_dat, minbucket = 50, cp = 0.01)
## n= 1400
##
## CP nsplit rel error xerror xstd
## 1 0.08141626 0 1.0000000 1.0012468 0.10797429
## 2 0.05494877 1 0.9185837 0.9046214 0.10427101
## 3 0.04037137 2 0.8636350 0.8680815 0.10242039
## 4 0.01421220 3 0.8232636 0.8554478 0.10127198
## 5 0.01167710 4 0.8090514 0.8451233 0.10084320
## 6 0.01000000 6 0.7856972 0.8452650 0.09970098
##
## Variable importance
## exper educ smsa pt
## 50 41 6 3
##
## Node number 1: 1400 observations, complexity param=0.08141626
## mean=609.0972, MSE=194781.2
## left son=2 (1042 obs) right son=3 (358 obs)
## Primary splits:
## educ < 15.5 to the left, improve=0.08141626, (0 missing)
## exper < 6.5 to the left, improve=0.08031668, (0 missing)
## pt < 0.5 to the right, improve=0.06714110, (0 missing)
## smsa < 0.5 to the left, improve=0.02506221, (0 missing)
## race < 0.5 to the right, improve=0.01012986, (0 missing)
##
## Node number 2: 1042 observations, complexity param=0.05494877
## mean=535.2835, MSE=137614.8
## left son=4 (254 obs) right son=5 (788 obs)
## Primary splits:
## exper < 8.5 to the left, improve=0.104496200, (0 missing)
## pt < 0.5 to the right, improve=0.063553440, (0 missing)
## smsa < 0.5 to the left, improve=0.023823630, (0 missing)
## educ < 13.5 to the left, improve=0.007382548, (0 missing)
## race < 0.5 to the right, improve=0.007300133, (0 missing)
## Surrogate splits:
## pt < 0.5 to the right, agree=0.769, adj=0.051, (0 split)
##
## Node number 3: 358 observations, complexity param=0.04037137
## mean=823.9403, MSE=299154.9
## left son=6 (86 obs) right son=7 (272 obs)
## Primary splits:
## exper < 6.5 to the left, improve=0.102794400, (0 missing)
## educ < 17.5 to the left, improve=0.040051080, (0 missing)
## mw < 0.5 to the right, improve=0.007169756, (0 missing)
## we < 0.5 to the left, improve=0.006666140, (0 missing)
## smsa < 0.5 to the left, improve=0.006588458, (0 missing)
## Surrogate splits:
## pt < 0.5 to the right, agree=0.777, adj=0.07, (0 split)
##
## Node number 4: 254 observations
## mean=324.0665, MSE=54694.06
##
## Node number 5: 788 observations, complexity param=0.0142122
## mean=603.3662, MSE=145327.6
## left son=10 (227 obs) right son=11 (561 obs)
## Primary splits:
## smsa < 0.5 to the left, improve=0.033842500, (0 missing)
## educ < 13.5 to the left, improve=0.025361930, (0 missing)
## exper < 45.5 to the right, improve=0.016949020, (0 missing)
## so < 0.5 to the right, improve=0.012174650, (0 missing)
## race < 0.5 to the right, improve=0.009381634, (0 missing)
##
## Node number 6: 86 observations
## mean=512.0743, MSE=107946
##
## Node number 7: 272 observations, complexity param=0.0116771
## mean=922.545, MSE=319136.3
## left son=14 (166 obs) right son=15 (106 obs)
## Primary splits:
## educ < 17.5 to the left, improve=0.027492650, (0 missing)
## exper < 11.5 to the left, improve=0.025944950, (0 missing)
## we < 0.5 to the left, improve=0.009313403, (0 missing)
## mw < 0.5 to the right, improve=0.004853830, (0 missing)
## so < 0.5 to the right, improve=0.001823206, (0 missing)
## Surrogate splits:
## pt < 0.5 to the left, agree=0.621, adj=0.028, (0 split)
##
## Node number 10: 227 observations
## mean=493.1174, MSE=72509.47
##
## Node number 11: 561 observations
## mean=647.9766, MSE=167883.9
##
## Node number 14: 166 observations, complexity param=0.0116771
## mean=847.6944, MSE=290513.8
## left son=28 (66 obs) right son=29 (100 obs)
## Primary splits:
## exper < 13.5 to the left, improve=0.082571610, (0 missing)
## ne < 0.5 to the right, improve=0.001728187, (0 missing)
## Surrogate splits:
## mw < 0.5 to the right, agree=0.608, adj=0.015, (0 split)
##
## Node number 15: 106 observations
## mean=1039.764, MSE=341446.1
##
## Node number 28: 66 observations
## mean=657.0488, MSE=98995.22
##
## Node number 29: 100 observations
## mean=973.5205, MSE=377095.7
In Figure 5.1, the highest average weekly wage (1040 dollars) is observed among individuals with 18 or more years of education and more than 6.5 years of work experience. Individuals with 16 or 17 years of education earn different amounts depending on their work experience: those with at least 14 years of experience earn about 317 dollars more per week than individuals with the same level of education but fewer than 14 years of experience. Individuals with the same level of education and experience can also earn different weekly wages depending on where they live: those with fewer than 16 years of education and more than 8.5 years of work experience who live in a standard metropolitan statistical area earn an average weekly wage of 648 dollars, which is 155 dollars more than individuals with the same characteristics who live outside a standard metropolitan statistical area. Individuals with fewer than 16 years of education and fewer than 8.5 years of experience earn the lowest average weekly wage (324 dollars).
if(require(rattle)){fancyRpartPlot(Tree_Model1,main="TREE MODEL ON USWAGES DATASET")} #A plot of the first tree
Figure 5.1: Tree Model on ‘uswages’ Dataset
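As a sanity check on how the tree in Figure 5.1 is read, a prediction can be generated for a hypothetical individual (the covariate values below are made up for illustration):
#Hypothetical worker: 18 years of education, 10 years of experience, full-time,
#living in an SMSA in the West; the tree should place this case in the
#highest-wage leaf (mean of about 1040 dollars)
new_obs <- data.frame(educ = 18, exper = 10, race = 0, smsa = 1,
                      ne = 0, mw = 0, so = 0, we = 1, pt = 0)
predict(Tree_Model1, newdata = new_obs)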
An RMSE of 456.935 was obtained when the model's performance was evaluated on the test data.
Pred_CP=predict(Tree_Model1, newdata =Tree_test_dat)
RMSE_CV.tree=sqrt(mean((Pred_CP-Tree_test_dat[,"wage"])^2))
RMSE_CV.tree #The RMSE associated with the model when pruned on minbucket of 50 and cp of 0.01 is 456.935
## [1] 456.935
New_CP=data.frame(Pred_CP,Tree_test_dat[,"wage"])
ggplot(New_CP)+geom_point(aes(x=Pred_CP,y=Tree_test_dat[,"wage"]),color="brown")+theme_bw()+xlab("Predicted Values")+ylab("Wage (Observed Values)")+
  theme(panel.grid = element_blank()) #A plot of the predicted values made on the model Vs the observed values in the test data
Figure 5.2: A Plot of Observed Weekly Wages Vs Tree Predicted Weekly Wages
In building the second model, the cv.tree() function was used to determine whether pruning the tree would improve the model's performance.
Tree_Model2 <- tree(wage ~ educ + exper + race + smsa + ne + mw + so + we + pt, Tree_train_dat) #Building another tree model on the train data
cv_results=cv.tree(Tree_Model2) #Using the cv.tree function to see if pruning the tree will improve model performance
plot(cv_results$size,cv_results$dev,type="b",ylab="Mean Squared Error (MSE)",xlab="Size", main = "A plot of Mean Squared Error (MSE) Vs Tree Size")
Figure 5.3: A plot of Mean Squared Error (MSE) Vs Tree Size
#Tree size of 8 nodes has the least MSE on the model
A tree size of 8 terminal nodes was selected by the cross-validation (Figure 5.3). The tree was then pruned to best = 8 nodes (Figure 5.4).
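The selected size can also be extracted programmatically, and the tree-package model pruned directly with prune.tree(); the following is a sketch of that alternative route, whereas the chunk below re-fits the model with rpart and prunes it with maptree's clip.rpart():
#Tree size with the smallest cross-validated deviance (8 in this run)
best_size <- cv_results$size[which.min(cv_results$dev)]
#Alternative pruning route using the tree package directly (sketch only)
Pruned_tree_pkg <- prune.tree(Tree_Model2, best = best_size)
plot(Pruned_tree_pkg); text(Pruned_tree_pkg, pretty = 0)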
Tree_Model_CV=rpart(wage~educ + exper + race + smsa + ne + mw + so + we + pt,Tree_train_dat)
Pruned_Tree=clip.rpart(Tree_Model_CV,best=8) #Building the second tree model on the train data with the rpart function and using tree size of 8 nodes
summary(Pruned_Tree)
## Call:
## rpart(formula = wage ~ educ + exper + race + smsa + ne + mw +
## so + we + pt, data = Tree_train_dat)
## n= 1400
##
## CP nsplit rel error xerror xstd
## 1 0.08141626 0 1.0000000 1.0006949 0.1079165
## 2 0.05494877 1 0.9185837 0.8906745 0.1032752
## 3 0.04037137 2 0.8636350 0.8629762 0.1028611
## 4 0.01798558 3 0.8232636 0.8575461 0.1018551
## 5 0.01507677 4 0.8052780 0.8486221 0.1010574
## 6 0.01421220 6 0.7751245 0.8423852 0.1006741
## 7 0.01035463 7 0.7609123 0.8254918 0.1006003
##
## Variable importance
## exper educ pt smsa we
## 44 37 9 6 4
##
## Node number 1: 1400 observations, complexity param=0.08141626
## mean=609.0972, MSE=194781.2
## left son=2 (1042 obs) right son=3 (358 obs)
## Primary splits:
## educ < 15.5 to the left, improve=0.08141626, (0 missing)
## exper < 6.5 to the left, improve=0.08031668, (0 missing)
## pt < 0.5 to the right, improve=0.06714110, (0 missing)
## smsa < 0.5 to the left, improve=0.02506221, (0 missing)
## race < 0.5 to the right, improve=0.01012986, (0 missing)
##
## Node number 2: 1042 observations, complexity param=0.05494877
## mean=535.2835, MSE=137614.8
## left son=4 (254 obs) right son=5 (788 obs)
## Primary splits:
## exper < 8.5 to the left, improve=0.104496200, (0 missing)
## pt < 0.5 to the right, improve=0.063553440, (0 missing)
## smsa < 0.5 to the left, improve=0.023823630, (0 missing)
## educ < 13.5 to the left, improve=0.007382548, (0 missing)
## race < 0.5 to the right, improve=0.007300133, (0 missing)
## Surrogate splits:
## pt < 0.5 to the right, agree=0.769, adj=0.051, (0 split)
##
## Node number 3: 358 observations, complexity param=0.04037137
## mean=823.9403, MSE=299154.9
## left son=6 (86 obs) right son=7 (272 obs)
## Primary splits:
## exper < 6.5 to the left, improve=0.102794400, (0 missing)
## pt < 0.5 to the right, improve=0.100464100, (0 missing)
## educ < 17.5 to the left, improve=0.040051080, (0 missing)
## mw < 0.5 to the right, improve=0.007169756, (0 missing)
## we < 0.5 to the left, improve=0.006666140, (0 missing)
## Surrogate splits:
## pt < 0.5 to the right, agree=0.777, adj=0.07, (0 split)
##
## Node number 4: 254 observations
## mean=324.0665, MSE=54694.06
##
## Node number 5: 788 observations, complexity param=0.0142122
## mean=603.3662, MSE=145327.6
## left son=10 (227 obs) right son=11 (561 obs)
## Primary splits:
## smsa < 0.5 to the left, improve=0.03384250, (0 missing)
## educ < 13.5 to the left, improve=0.02536193, (0 missing)
## exper < 46.5 to the right, improve=0.02204489, (0 missing)
## pt < 0.5 to the right, improve=0.01827383, (0 missing)
## so < 0.5 to the right, improve=0.01217465, (0 missing)
##
## Node number 6: 86 observations
## mean=512.0743, MSE=107946
##
## Node number 7: 272 observations, complexity param=0.01798558
## mean=922.545, MSE=319136.3
## left son=14 (13 obs) right son=15 (259 obs)
## Primary splits:
## pt < 0.5 to the right, improve=0.056500800, (0 missing)
## exper < 31.5 to the right, improve=0.028448230, (0 missing)
## educ < 17.5 to the left, improve=0.027492650, (0 missing)
## we < 0.5 to the left, improve=0.009313403, (0 missing)
## race < 0.5 to the right, improve=0.006714050, (0 missing)
##
## Node number 10: 227 observations
## mean=493.1174, MSE=72509.47
##
## Node number 11: 561 observations
## mean=647.9766, MSE=167883.9
##
## Node number 14: 13 observations
## mean=323.1769, MSE=44534.74
##
## Node number 15: 259 observations, complexity param=0.01507677
## mean=952.6291, MSE=313982.9
## left son=30 (161 obs) right son=31 (98 obs)
## Primary splits:
## educ < 17.5 to the left, improve=0.043858640, (0 missing)
## exper < 11.5 to the left, improve=0.037247210, (0 missing)
## we < 0.5 to the left, improve=0.013604680, (0 missing)
## mw < 0.5 to the right, improve=0.007814959, (0 missing)
## race < 0.5 to the right, improve=0.006922244, (0 missing)
##
## Node number 30: 161 observations, complexity param=0.01507677
## mean=861.0744, MSE=292163.8
## left son=60 (66 obs) right son=61 (95 obs)
## Primary splits:
## exper < 13.5 to the left, improve=0.098983620, (0 missing)
## we < 0.5 to the left, improve=0.017242090, (0 missing)
## race < 0.5 to the right, improve=0.007541222, (0 missing)
## smsa < 0.5 to the left, improve=0.006761441, (0 missing)
## mw < 0.5 to the right, improve=0.006211023, (0 missing)
## Surrogate splits:
## mw < 0.5 to the right, agree=0.596, adj=0.015, (0 split)
##
## Node number 31: 98 observations
## mean=1103.04, MSE=313434.1
##
## Node number 60: 66 observations
## mean=657.0488, MSE=98995.22
##
## Node number 61: 95 observations
## mean=1002.819, MSE=377354.3
Figure 5.4 shows that the highest average weekly wage (1103 dollars) is observed among individuals with 18 or more years of education and more than 6.5 years of work experience who do not work part time. In contrast, individuals with 16 or more years of education and more than 6.5 years of experience who do work part time earn an average weekly wage of 323 dollars, roughly a third of the former and comparable to the average weekly wage of individuals with fewer than 16 years of education and fewer than 8.5 years of experience. It is also observed that individuals with the same level of education and work experience earn different weekly wages depending on the areas in which they live: those with fewer than 16 years of education and more than 8.5 years of experience who live in a standard metropolitan statistical area earn an average weekly wage of 648 dollars, which is 155 dollars more than individuals with the same characteristics who live outside a standard metropolitan statistical area.
if(require(rattle)){fancyRpartPlot(Pruned_Tree,main="TREE MODEL ON USWAGES DATASET")} #A plot of the second tree
Figure 5.4: Pruned Tree Model on ‘uswages’ Dataset
An RMSE of 452.9443 was obtained when the resulting model's performance was evaluated on the test data.
Pred_values=predict(Pruned_Tree, newdata =Tree_test_dat) #Testing the model on the test data
RMSE_Pruned_Tree=sqrt(mean((Pred_values-Tree_test_dat[,"wage"])^2))
RMSE_Pruned_Tree #The RMSE associated with the model when pruned on tree size of 8 nodes is 452.9443
## [1] 452.9443
Export2=data.frame(Observed_Wages=Tree_test_dat[,"wage"],Predicted_Wages=Pred_values)
datatable(Export2,filter = "top",rownames = F) #Scoring dataset for the Tree regression model on the data
New_dat=data.frame(Pred_values,Tree_test_dat[,"wage"])
ggplot(New_dat)+geom_point(aes(x=Pred_values,y=Tree_test_dat[,"wage"]),color="brown")+theme_bw()+xlab("Predicted Values")+ylab("Wage (Observed Values)")+
  theme(panel.grid = element_blank()) #A plot of the predicted values made on the model Vs the observed values in the test data
Figure 5.5: A Plot of Observed Weekly Wages Vs Tree (Pruned) Predicted Weekly Wages
The OLS linear regression model, with an RMSE of 452.7986, performed best in predicting individuals' weekly wages given the predictors in the dataset. However, its performance was comparable to that of the pruned tree model (RMSE of 452.9443).
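The three test RMSEs computed above can be collected into a single comparison table, for example:
#Test-set RMSE comparison (values computed earlier in the analysis)
data.frame(Model = c("OLS regression",
                     "Tree (minbucket = 50, cp = 0.01)",
                     "Pruned tree (8 terminal nodes)"),
           Test_RMSE = c(lm_RMSE, RMSE_CV.tree, RMSE_Pruned_Tree))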