url <- "http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
names <- c("age","workclass","fnlwgt","education","education-num","marital-status","occupation","relationship","race","sex","capital-gain","capital-loss","hours-per-week","native-country", "salary")
xx <- readr::read_csv(url,col_names=names)
## Rows: 32561 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): workclass, education, marital-status, occupation, relationship, rac...
## dbl (6): age, fnlwgt, education-num, capital-gain, capital-loss, hours-per-week
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#View(xx)
#1 Load the data and ensure the labels are correct. You are working to develop a model that can predict age.
data <- xx
str(data)
## spec_tbl_df [32,561 × 15] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:32561] 39 50 38 53 28 37 49 52 31 42 ...
## $ workclass : chr [1:32561] "State-gov" "Self-emp-not-inc" "Private" "Private" ...
## $ fnlwgt : num [1:32561] 77516 83311 215646 234721 338409 ...
## $ education : chr [1:32561] "Bachelors" "Bachelors" "HS-grad" "11th" ...
## $ education-num : num [1:32561] 13 13 9 7 13 14 5 9 14 13 ...
## $ marital-status: chr [1:32561] "Never-married" "Married-civ-spouse" "Divorced" "Married-civ-spouse" ...
## $ occupation : chr [1:32561] "Adm-clerical" "Exec-managerial" "Handlers-cleaners" "Handlers-cleaners" ...
## $ relationship : chr [1:32561] "Not-in-family" "Husband" "Not-in-family" "Husband" ...
## $ race : chr [1:32561] "White" "White" "White" "Black" ...
## $ sex : chr [1:32561] "Male" "Male" "Male" "Male" ...
## $ capital-gain : num [1:32561] 2174 0 0 0 0 ...
## $ capital-loss : num [1:32561] 0 0 0 0 0 0 0 0 0 0 ...
## $ hours-per-week: num [1:32561] 40 13 40 40 40 40 16 45 50 40 ...
## $ native-country: chr [1:32561] "United-States" "United-States" "United-States" "United-States" ...
## $ salary : chr [1:32561] "<=50K" "<=50K" "<=50K" "<=50K" ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. workclass = col_character(),
## .. fnlwgt = col_double(),
## .. education = col_character(),
## .. `education-num` = col_double(),
## .. `marital-status` = col_character(),
## .. occupation = col_character(),
## .. relationship = col_character(),
## .. race = col_character(),
## .. sex = col_character(),
## .. `capital-gain` = col_double(),
## .. `capital-loss` = col_double(),
## .. `hours-per-week` = col_double(),
## .. `native-country` = col_character(),
## .. salary = col_character()
## .. )
## - attr(*, "problems")=<externalptr>
#View(data)
#2 Ensure all the variables are classified correctly including the target variable and collapse factors if still needed.
data <- data[,-3]
data <- data[,-11]
data <- data[,-10]
table(data$`marital-status`)
##
## Divorced Married-AF-spouse Married-civ-spouse
## 4443 23 14976
## Married-spouse-absent Never-married Separated
## 418 10683 1025
## Widowed
## 993
data$`marital-status` <- fct_collapse(data$`marital-status`,
Not.Married=c("Never-married", "Divorced", "Separated", "Widowed"),
Married=c("Married-civ-spouse","Married-AF-spouse", "Married-spouse-absent"))
data$`native-country` <- fct_collapse(data$`native-country`,
North_America = c("Canada", "United-States", "Mexico"),
South_America = c("Columbia", "Ecuador", "Peru"),
CA_Carrib = c("Cuba", "Dominican-Republic", "El-Salvador", "Guatemala", "Haiti", "Honduras", "Jamaica", "Nicaragua", "Puerto-Rico", "Trinadad&Tobago", "Outlying-US(Guam-USVI-etc)"),
Europe = c("England", "France", "Germany", "Greece", "Holand-Netherlands", "Hungary", "Ireland", "Italy", "Portugal", "Scotland", "Yugoslavia"),
Asia = c("Cambodia", "China", "Hong", "India", "Iran", "Japan", "Laos", "South", "Taiwan", "Thailand", "Vietnam"),
Unknown = c("?")
)
data$`hours-per-week` <- as.character(data$`hours-per-week`)
pt <- as.character(c(1:39))
ot <- as.character(c(41:68, 70, 72:78, 80:82, 84:92, 94:97))
data$`hours-per-week` <- fct_collapse(data$`hours-per-week`,
part_time = pt,
full_time = "40",
overtime = ot
)
data$`education` <- fct_collapse(data$`education`,
Pre_Elem = c("Preschool", "1st-4th"),
Middle = c("5th-6th", "7th-8th"),
HS_nograd = c("9th", "10th", "11th", "12th"),
HS_grad = c("HS-grad"),
Some_college = c("Some-college"),
Assoc_acdm = c("Assoc-acdm"),
Assoc_voc = c("Assoc-voc"),
Prof_school = c("Prof-school"),
Bachelors = c("Bachelors"),
Masters = c("Masters"),
Doctorate = c("Doctorate")
)
data$workclass <- fct_collapse(data$workclass,
Unknown = c("?"))
data$occupation <- fct_collapse(data$occupation,
Unknown = c("?"))
data$salary <- fct_collapse(data$salary,
Low = c("<=50K"),
High = c(">50K"))
# Target Variable
table(data$age)
##
## 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
## 395 550 712 753 720 765 877 798 841 785 835 867 813 861 888 828 875 886 876 898
## 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56
## 858 827 816 794 808 780 770 724 734 737 708 543 577 602 595 478 464 415 419 366
## 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76
## 358 366 355 312 300 258 230 208 178 150 151 120 108 89 72 67 64 51 45 46
## 77 78 79 80 81 82 83 84 85 86 87 88 90
## 29 23 22 22 20 12 6 10 3 1 1 3 43
#3 Check for missing variables and correct as needed.
sum(is.na(data))
## [1] 0
sapply(data, function(x) sum(is.na(x)))
## age workclass education education-num marital-status
## 0 0 0 0 0
## occupation relationship race sex hours-per-week
## 0 0 0 0 0
## native-country salary
## 0 0
#6 Split your data into test, tune, and train. (80/10/10)
part_index_1 <- caret::createDataPartition(data$age,
times=1,
p = 0.80,
groups=1,
list=FALSE)
train <- data[part_index_1, ]
tune_and_test <- data[-part_index_1, ]
tune_and_test_index <- createDataPartition(tune_and_test$age,
p = .5,
list = FALSE,
times = 1)
tune <- tune_and_test[tune_and_test_index, ]
test <- tune_and_test[-tune_and_test_index, ]
dim(train)
## [1] 26049 12
dim(tune)
## [1] 3257 12
dim(test)
## [1] 3255 12
#7 Build your model using the training data, rpart2, and repeated cross validation as reviewed in class with the caret package.
features <- train[,-1]#dropping 12 because it's target variable.
target <- train$age
str(features)
## tibble [26,049 × 11] (S3: tbl_df/tbl/data.frame)
## $ workclass : Factor w/ 9 levels "Unknown","Federal-gov",..: 8 5 5 5 5 5 7 5 5 8 ...
## $ education : Factor w/ 11 levels "HS_nograd","Pre_Elem",..: 6 8 1 6 9 1 8 9 6 6 ...
## $ education-num : num [1:26049] 13 9 7 13 14 5 9 14 13 13 ...
## $ marital-status: Factor w/ 2 levels "Not.Married",..: 1 1 2 2 2 2 2 1 2 2 ...
## $ occupation : Factor w/ 15 levels "Unknown","Adm-clerical",..: 2 7 7 11 5 9 5 11 5 11 ...
## $ relationship : chr [1:26049] "Not-in-family" "Not-in-family" "Husband" "Wife" ...
## $ race : chr [1:26049] "White" "White" "Black" "Black" ...
## $ sex : chr [1:26049] "Male" "Male" "Male" "Female" ...
## $ hours-per-week: Factor w/ 5 levels "part_time","full_time",..: 2 2 2 2 2 1 3 3 2 2 ...
## $ native-country: Factor w/ 8 levels "Unknown","Asia",..: 3 3 3 5 3 5 3 3 3 2 ...
## $ salary : Factor w/ 2 levels "Low","High": 1 1 1 1 1 1 2 2 2 2 ...
str(target)
## num [1:26049] 39 38 53 28 37 49 52 31 42 30 ...
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
)
tree.grid <- expand.grid(maxdepth=c(5,7,9,11))
set.seed(2001)
age_mdl <- train(x=features,
y=target,
method="rpart2",
trControl=fitControl,
metric="RMSE")
set.seed(2001)
age_mdl_1 <- train(x=features,
y=target,
method="rpart2",#type of model uses maxdepth to select a model
trControl=fitControl,#previously created
tuneGrid=tree.grid,#expanded grid
metric="RMSE")#selected on of the metrics available from two variable summary.
#6 View the results, comment on how the model performed and which variables appear to be contributing the most (variable importance)
age_mdl
## CART
##
## 26049 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 23445, 23445, 23442, 23445, 23444, 23445, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 1 12.30695 0.1849088 9.808514
## 2 12.04335 0.2194914 9.598679
## 3 11.91762 0.2357238 9.473275
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 3.
age_mdl_1
## CART
##
## 26049 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 23445, 23445, 23442, 23445, 23444, 23445, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 5 11.90661 0.2371054 9.462976
## 7 11.90661 0.2371054 9.462976
## 9 11.90661 0.2371054 9.462976
## 11 11.90661 0.2371054 9.462976
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 5.
varImp(age_mdl)
## rpart2 variable importance
##
## Overall
## relationship 100.000
## marital-status 62.646
## workclass 35.136
## salary 32.243
## education 28.845
## occupation 8.579
## education-num 4.356
## `hours-per-week` 0.000
## race 0.000
## `education-num` 0.000
## `native-country` 0.000
## `marital-status` 0.000
## sex 0.000
When examining our model, it seems as though our R squared value remains very low and that our model is not super effective at predicting the age variable. This is because R squared represents the amount of variability in the age variable that can be predicted by the model. Furthermore, when examining our variables of importance, it is clear that the relationship column has the largest impact on age. Furthermore, marital-status, education, workclass, and salary all appear to contribute substantially to the model, with a few other variables being relevant to our predictions.
#7 Plot the output of the model to see the tree visually, using rpart.plot, is there anything you notice that might be a concern?
plot(age_mdl)
rpart.plot(age_mdl$finalModel, type=4,extra=101)
The biggest concern that is easily apparent from our model is that our maximum depth is of 3. While this means that our model is simple and quick, the lack of complexity makes it difficult to yield more accurate results. Furthermore, most of the tree seems to rely on the relationship variable, meaning that there are not a lot of variables that can help contribute significantly to our predictions.
#8 Use the tune set and the predict function with your model to make predicts for the target variable.
age_pred_tune_r = predict(age_mdl,tune)
#View(as_tibble(age_pred_tune_r))
#9 Use the postResample function to get your evaluation metrics. Also calculate NRMSE using the range (max-min) for the target variable. Explain what all these measures mean in terms of your models predictive power.
postResample(pred = age_pred_tune_r, obs = tune$age)
## RMSE Rsquared MAE
## 12.0897355 0.2286903 9.5245276
range(tune$age)
## [1] 17 90
90-17
## [1] 73
11.86/73
## [1] 0.1624658
We can first see that our model has a large range for the age variable. When we examine the metrics, we know that RMSE (root mean square deviation) measures the quality of the fit of our model for the age variable. Because this value is high, this means that currently our model is not a good predictor of the variable. However, when we examine our NRMSE (normalized), which considers the range of our target variable, we see that our model actually is somewhat effective and has lower variance when we consider the characteristics of the target variable. This means that when we have normalized our RMSE, our model is not as ineffective as first thought to be (NRMSE = .162) because of NRMSE’s lower value.
#10 Based on your understanding of the model anrd data adjust the hyper-parameter via the built in train control function in caret or build and try new features, does the model quality improve or not? If so how and why, if not, why not?
age_mdl_new <- train(x=features,
y=target,
method="rpart2",
trControl=fitControl,
tuneGrid=tree.grid,
metric="RMSE")
rpart.plot(age_mdl_new$finalModel, type = 5, extra=101)
age_mdl_new
## CART
##
## 26049 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 23444, 23444, 23445, 23444, 23444, 23445, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 5 11.9155 0.2359274 9.4714
## 7 11.9155 0.2359274 9.4714
## 9 11.9155 0.2359274 9.4714
## 11 11.9155 0.2359274 9.4714
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 5.
features$`education-num` <- as.numeric(features$`education-num`)
education_num <- cut(xx$`education-num`,c(1,10,16), labels = c("low","high"))
data_2 <- data.frame(data[,-4],education_num)
#Here I featured education-num, to make it more collapsed and to just devide the whole data.
set.seed(2001)
part_index_1 <- caret::createDataPartition(data_2$age,
times=1,
p = 0.80,
groups=1,
list=FALSE)
train_1 <- data_2[part_index_1, ]
tune_and_test_1 <- data_2[-part_index_1, ]
tune_and_test_index <- createDataPartition(tune_and_test_1$salary,
p = .5,
list = FALSE,
times = 1)
tune_1 <- tune_and_test_1[tune_and_test_index, ]
test_1 <- tune_and_test_1[-tune_and_test_index, ]
features_1 <- train_1[,-1]#dropping 12 because it's target variable.
target_1 <- train_1$age
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 5,
)
tree.grid <- expand.grid(maxdepth=c(3,5,7,9,11))
set.seed(2001)
age_mdl_new2 <- train(x=features_1,
y=target_1,
method="rpart2",
trControl=fitControl,
tuneGrid=tree.grid,#expanded grid
metric="RMSE")
age_mdl_new2
## CART
##
## 26049 samples
## 11 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 23445, 23445, 23444, 23444, 23443, 23444, ...
## Resampling results across tuning parameters:
##
## maxdepth RMSE Rsquared MAE
## 3 11.91780 0.2357938 9.457731
## 5 11.89143 0.2391992 9.440919
## 7 11.89143 0.2391992 9.440919
## 9 11.89143 0.2391992 9.440919
## 11 11.89143 0.2391992 9.440919
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was maxdepth = 5.
The model is improving based on the changes we make to the contributing variables of importance and different depth of the tree, but the improvements are not significant. It appears that our values of RMSE, Rsquared, and MAE all remian similar regardless of attempted changes.
#11 Once you are confident that your model is not improving, via changes implemented on the training set and evaluated on the the tune set, predict with the test set and report final evaluation of the model. Discuss the output in comparison with the previous evaluations.
age_pred_tune_r2 = predict(age_mdl_new2,test_1)
postResample(pred = age_pred_tune_r2, obs = test_1$age)
## RMSE Rsquared MAE
## 11.9263436 0.2545489 9.4554681
range(test_1$age)
## [1] 17 90
90-17
## [1] 73
11.92/73
## [1] 0.1632877
When examining our new output for the test case, we see that our Rsquared value is higher than it was in our initial evaluation. Our RMSE value and NRMSE value are both within .01 of each other, meaning that although we made a few improvements to our model, the results did not change significantly. For both the initial and final output, we see that the initial RMSE is very high, but the NRMSE takes in the range for both ouputs and gives us a normalized value.
#12 Summarize what you learned along the way and make recommendations on how this could be used moving forward, being careful not to over promise.
When considering our data set and model, we see that our target variable of age has a large range. This large range impacts our model’s performance, and we see the change in root mean square deviation after we normalize it based on the range of the age variable. As a result, we can say after normalizing that our model is somewhat effective at predicting age due to its fit and NRMSE value, but should not be relied upon to make significant decisions and predictions, especially due to its low Rsquared value.
#13 What was the most interesting or hardest part of this process and what questions do you still have?
I think the hardest part of this process was once again trying to make improvements to the model through feature engineering. It is hard to understand what substantial steps can be taken to improve a model, especially when there is a lot of work that needs to be done for any change to see the results. I would like to learn more about how to improve RMSE and Rsquared values for our model, because I feel like the steps I take currently are more based on guessing and checking for improvements, rather than knowledgeable decisions.