library(ggplot2)
library(tidyverse)
library(graphics)
library(mdsr) # install package if not installed
library(discrim) # install package if not installed
library(klaR) # install package if not installed
library(kknn) # install package if not installed
library(utils) # install package if not installed
library(sp) # install package if not installed
library(fs)
Note: If your Rmd file submission knits,
you will receive a total of 5 points extra credit.
Directions: Complete Task 1 and one of Tasks 2 or 3!
A marketing analyst might be interested in finding factors that can
be used to predict whether a potential customer is a high-earner. The
1994 United States Census provides information that can
inform such a model, with records from 32,561 adults that
include a binary variable indicating whether each person makes more
or less than $50,000 (more than $80,000 today
after accounting for inflation). This is our response variable.
library(tidyverse)
library(mdsr)
url <-
"http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
census <- read_csv(
url,
col_names = c(
"age", "workclass", "fnlwgt", "education",
"education_1", "marital_status", "occupation", "relationship",
"race", "sex", "capital_gain", "capital_loss", "hours_per_week",
"native_country", "income"
)
) %>%
mutate(income = factor(income), income_ind = as.numeric(income == ">50K")) # create indicator variable income_ind (0 - low, 1 - high earner)
Rows: 32561 Columns: 15
── Column specification ──────────────────────────────────────────────
Delimiter: ","
chr (9): workclass, education, marital_status, occupation, relationship, race, sex, native_country, income
dbl (6): age, fnlwgt, education_1, capital_gain, capital_loss, hours_per_week
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# look at the structure of the data
glimpse(census)
Rows: 32,561
Columns: 16
$ age <dbl> 39, 50, 38, 53, 28, 37, 49, 52, 31, 42, 37, 30, 23, 32, 40, 34, 25, 32, 38, 43, 40, 54, 35, 43, 59, 56, 19, 54, 39, 49, 23, 20, 45, 30, 22, 4…
$ workclass <chr> "State-gov", "Self-emp-not-inc", "Private", "Private", "Private", "Private", "Private", "Self-emp-not-inc", "Private", "Private", "Private", …
$ fnlwgt <dbl> 77516, 83311, 215646, 234721, 338409, 284582, 160187, 209642, 45781, 159449, 280464, 141297, 122272, 205019, 121772, 245487, 176756, 186824, …
$ education <chr> "Bachelors", "Bachelors", "HS-grad", "11th", "Bachelors", "Masters", "9th", "HS-grad", "Masters", "Bachelors", "Some-college", "Bachelors", "…
$ education_1 <dbl> 13, 13, 9, 7, 13, 14, 5, 9, 14, 13, 10, 13, 13, 12, 11, 4, 9, 9, 7, 14, 16, 9, 5, 7, 9, 13, 9, 10, 9, 9, 12, 10, 13, 10, 10, 7, 10, 9, 10, 12…
$ marital_status <chr> "Never-married", "Married-civ-spouse", "Divorced", "Married-civ-spouse", "Married-civ-spouse", "Married-civ-spouse", "Married-spouse-absent",…
$ occupation <chr> "Adm-clerical", "Exec-managerial", "Handlers-cleaners", "Handlers-cleaners", "Prof-specialty", "Exec-managerial", "Other-service", "Exec-mana…
$ relationship <chr> "Not-in-family", "Husband", "Not-in-family", "Husband", "Wife", "Wife", "Not-in-family", "Husband", "Not-in-family", "Husband", "Husband", "H…
$ race <chr> "White", "White", "White", "Black", "Black", "White", "Black", "White", "White", "White", "Black", "Asian-Pac-Islander", "White", "Black", "A…
$ sex <chr> "Male", "Male", "Male", "Male", "Female", "Female", "Female", "Male", "Female", "Male", "Male", "Male", "Female", "Male", "Male", "Male", "Ma…
$ capital_gain <dbl> 2174, 0, 0, 0, 0, 0, 0, 0, 14084, 5178, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ capital_loss <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2042, 0, 0, 0, 0, 0, 0, 0, 0, 1408, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
$ hours_per_week <dbl> 40, 13, 40, 40, 40, 40, 16, 45, 50, 40, 80, 40, 30, 50, 40, 45, 35, 40, 50, 45, 60, 20, 40, 40, 40, 40, 40, 60, 80, 40, 52, 44, 40, 40, 15, 4…
$ native_country <chr> "United-States", "United-States", "United-States", "United-States", "Cuba", "United-States", "Jamaica", "United-States", "United-States", "Un…
$ income <fct> <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, <=50K, >50K, >50K, >50K, >50K, >50K, <=50K, <=50K, >50K, <=50K, <=50K, <=50K, <=50K, >50K, >50K, <=…
$ income_ind <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, …
a) (10 pts) Split the data set into two pieces by
separating the rows at random. A sample of 70% of the rows will become
the training data set, with the remaining 30% set aside as
the testing (or “hold-out”) data set. Use
set.seed(364) at the beginning of your code. How many
records are in the testing set?
Answer: 9769 records.
Hint: One possible way to do this is to use the
function initial_split(prop = 0.7) and call the functions
training() and testing().
library(tidymodels)
set.seed(364)
n <- nrow(census)
census_parts <- census %>%
initial_split(prop = 0.7)
train <- census_parts %>% training()
test <- census_parts %>% testing()
nrow(test)
[1] 9769
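As a quick sanity check, the two pieces partition the original rows (22792 + 9769 = 32561):
# the training and testing sets together account for every row of census
nrow(train) + nrow(test) == nrow(census)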
Note: You should find that around 24% of those in the
sample make more than $50k. Thus, the accuracy of the
null model is about 76%, since we can get that
many right by just predicting that everyone makes less than
$50k.
# if your training set is called `train` this code will produce the correct percentage
pi_bar <- train %>%
count(income) %>%
mutate(pct = n / sum(n)) %>%
filter(income == ">50K") %>%
pull(pct)
print(c("Percent >50K", pi_bar))
[1] "Percent >50K" "0.241751491751492"
Pro Tip: Always benchmark your predictive models against a reasonable null model.
b) (10 pts) Use the KNN algorithm to
classify the earners (<=50K low, >50K high) in
the 1994 United States Census data above. Select only
the quantitative variables
age, education_1, capital_gain, capital_loss, hours_per_week.
Use mode = "classification" and k = 1 (use the
closest neighbor) in the nearest_neighbor function
arguments. Print the confusion matrix. State the accuracy.
The accuracy is 0.8492453, or 84.92% ≈ 85%.
Hint: See Programming exercises Week 13 for details
about KNN implementation.
library(kknn)
# the distance metric works only with quantitative variables, so keep just those (plus the outcome) in train_q
train_q <- train %>% dplyr::select(income, age, education_1, capital_gain, capital_loss, hours_per_week)
# define knn classifier
mod_knn <- nearest_neighbor(neighbors = 5, mode = "classification") %>%
set_engine("kknn", scale = TRUE) %>%
fit(income ~ ., data = train_q)
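# note: the task specifies k = 1, while the chunk above uses neighbors = 5; a variant
# matching the stated instruction would be the sketch below (with k = 1, each training
# point is its own nearest neighbor, so training accuracy will be near 100%)
# mod_knn1 <- nearest_neighbor(neighbors = 1, mode = "classification") %>%
#   set_engine("kknn", scale = TRUE) %>%
#   fit(income ~ ., data = train_q)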
# predict income with the knn classifier; predictions are saved in a new column `income_knn`
pred <- train_q %>%
bind_cols(
predict(mod_knn, new_data = train_q, type = "class")
) %>%
rename(income_knn = .pred_class)
# print the confusion matrix
pred %>% conf_mat(income, income_knn)
Truth
Prediction <=50K >50K
<=50K 15843 1997
>50K 1439 3513
Accuracy is:
# Find the Accuracy = (true positive and true negative)/total or use the `accuracy()` function.
pred %>%
accuracy(income, income_knn)
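Equivalently, the accuracy can be computed by hand from the confusion matrix counts shown above:
# accuracy = (true negatives + true positives) / total
(15843 + 3513) / (15843 + 1997 + 1439 + 3513)  # ≈ 0.8492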
form <- as.formula(
"income ~ age + workclass + education + marital_status +
occupation + relationship + race + sex +
capital_gain + capital_loss + hours_per_week"
)
form
income ~ age + workclass + education + marital_status + occupation +
relationship + race + sex + capital_gain + capital_loss +
hours_per_week
c) (10 pts) Build a naive Bayes classifier and compute
its accuracy using the formula given above, form:
income ~ age + workclass + education + marital_status + occupation + relationship + race + sex + capital_gain + capital_loss + hours_per_week
Hint: See Programming exercises Week 13 for details
about naive Bayes classifier implementation.
library(discrim)
# create the naive Bayes classifier with the formula stored in `form`
mod_nb <- naive_Bayes(mode = "classification") %>%
  set_engine("klaR") %>%
  fit(form, data = train)
# use the predict method with the mod_nb model
pred <- train %>%
bind_cols(
predict(mod_nb, new_data = train, type = "class")
) %>%
rename(income_nb = .pred_class)
# confusion matrix
pred %>% conf_mat(income, income_nb)
Truth
Prediction <=50K >50K
<=50K 16965 3949
>50K 317 1561
# accuracy
pred %>% accuracy(income, income_nb)
From the confusion matrix, the accuracy is (16965 + 1561) / 22792 ≈ 0.813.
d) (10 pts) Use logistic regression to model the
probability of being a high earner (income >50K) in
the 1994 United States Census data above. As the response
variable use the variable income_ind (0/1) created in the
data processing step at the beginning. As predictors use
age, education_1, sex, and optionally
marital status and other variables if you want. To review
the usefulness of the variables, inspect the plots below.
Hint: See Programming exercises Week 13 for details
about logit model implementation.
# create plot of income_ind vs age/education.num/sex/marital status
log_plot <- ggplot(data = census, aes(x = age, y = income_ind)) +
geom_jitter(alpha = 0.1, height = 0.05) +
geom_smooth(method = "glm", method.args = list(family = "binomial")) +
ylab("Earner Status")
log_plot + xlab("Age (in years)")
log_plot + aes(x = education_1) +
xlab("Education level (1-16, 16 is the highest)")
log_plot + aes(x = sex) +
xlab("Gender")
log_plot + aes(x = marital_status) +
xlab("Marital Status")
Q: Which variables appear to be important?
Based on the deviance and coefficients, the more important variables appear to be age, sexMale, and education_1.
Use a logistic regression model to model the probability of high income as a function of all chosen predictors.
Use the glm() function, setting
family = binomial for dichotomous outcomes.
logreg <- glm(income_ind ~ age + sex + education_1, family = "binomial", data = train)
tidy(logreg)
print(logreg)
Call: glm(formula = income_ind ~ age + sex + education_1, family = "binomial",
data = train)
Coefficients:
(Intercept) age sexMale education_1
-7.67469 0.04209 1.30401 0.36542
Degrees of Freedom: 22791 Total (i.e. Null); 22788 Residual
Null Deviance: 25210
Residual Deviance: 20230 AIC: 20240
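Exponentiating the coefficients turns them into odds ratios; for example, each additional education level multiplies the odds of being a high earner by exp(0.36542) ≈ 1.44:
# odds ratios: exponentiate the logit coefficients
exp(coef(logreg))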
FYI: The predicted probabilities and predicted
values for income_ind can be found using the code below;
uncomment the code lines to use them (highlight the chunk and press CTRL
+ SHIFT + C):
# use the predict method with the logreg model; below are the predicted probabilities
logit_pred_prob <- predict(logreg, newdata = train, type = "response")
# assign 1/0 based on logit_pred_prob > 0.5; this is the predicted high-earner status. You can choose a different cutoff value if preferred.
pred <- as.numeric(logit_pred_prob > 0.5)
# confusion matrix
confusion_matrix <- table(pred, train$income_ind)
# accuracy
mean(pred == train$income_ind, na.rm = TRUE)
[1] 0.7943577
e) (10 pts) Assess the logit model from
part d) using the test set saved in the test
R object.
What is the accuracy of the model?
# use the predict method with the logreg model; below are the predicted probabilities
logit_pred_prob <- predict(logreg, newdata = census, type = "response")
# assign 1/0 based on logit_pred_prob > 0.5; this is the predicted high-earner status. You can choose a different cutoff value if preferred.
logit_pred_class <- ifelse(logit_pred_prob > 0.5, 1, 0)
# confusion matrix
confusion_matrix <- table(logit_pred_class, census$income_ind)
# accuracy
accuracy <- sum(diag(confusion_matrix)) / sum(confusion_matrix)
print(confusion_matrix)
logit_pred_class 0 1
0 23162 5059
1 1558 2782
print(accuracy)
[1] 0.7967814
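Note that the chunk above scores the model on the full census data (the confusion matrix totals 32,561 rows). A minimal sketch of the same computation restricted to the held-out test set from part a) would be:
# predicted probabilities on the held-out test set
logit_test_prob <- predict(logreg, newdata = test, type = "response")
# classify as a high earner when the predicted probability exceeds 0.5
logit_test_class <- ifelse(logit_test_prob > 0.5, 1, 0)
# test-set accuracy
mean(logit_test_class == test$income_ind, na.rm = TRUE)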
f) (10 pts) Which one of the classification models
achieved the highest accuracy?
The KNN algorithm, with an accuracy of 0.8492453 (compared with roughly 0.813 for naive Bayes and about 0.794 for the logistic regression on the training set).
Let us consider the unsupervised learning process of identifying
different types of cars. The United States Department of Energy
maintains automobile characteristics for thousands of cars:
miles per gallon, engine size,
number of cylinders, number of gears, etc.
Please see their guide for more information. Here, we download a ZIP file from their website that contains fuel economy ratings for the 2016 model year.
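A minimal sketch of that download step (the URL is an assumption based on fueleconomy.gov and should be verified before use):
# assumed URL for the 2016 fuel economy guide ZIP; verify before use
src <- "https://www.fueleconomy.gov/feg/epadata/16data.zip"
lcl <- fs::path(tempdir(), "fueleconomy.zip")
download.file(src, destfile = lcl, quiet = TRUE)
# extract the workbook into a temporary folder
utils::unzip(lcl, exdir = fs::path(tempdir(), "fueleconomy"))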
Next, we use the readxl package to read this file into
R, clean up some of the resulting variable names, select a
small subset of the variables, and filter for distinct models of Toyota
vehicles. The resulting data set contains information about
75 different models that Toyota produces. Store the data
file “2016 FEGuide.xlsx” in a subfolder named ‘data’ in your
working directory.
Note: You may need to adjust the code below to specify the “2016 FEGuide.xlsx” file location if you opt to store the data file in a different location.
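For example, with the workbook stored at the suggested location, it can be read as follows (path assumed; adjust to your setup):
library(readxl)
# read the 2016 fuel economy guide workbook (assumed location: data/2016 FEGuide.xlsx)
X2016_FEGuide <- read_excel("data/2016 FEGuide.xlsx")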
# load the readxl package to read the xlsx file in R
library(readxl)
# X2016_FEGuide was imported by downloading the workbook and importing the data set directly,
# so no separate filename/read_excel call is needed in this chunk (see the sketch above)
cars <- X2016_FEGuide %>%
janitor::clean_names() %>%
dplyr::select(
make = mfr_name,
model = carline,
displacement = eng_displ,
number_cyl,
number_gears,
city_mpg = city_fe_guide_conventional_fuel,
hwy_mpg = hwy_fe_guide_conventional_fuel
) %>%
distinct(model, .keep_all = TRUE) %>%
filter(make == "Toyota") # filter Toyota vehicles only
# have a look at the data
glimpse(cars)
Rows: 75
Columns: 7
$ make <chr> "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "Toyota", "To…
$ model <chr> "FR-S", "RC 200t", "RC 300 AWD", "RC 350", "RC 350 AWD", "RC F", "iA", "CT 200h", "GS F", "IS 200t", "IS 300 AWD"…
$ displacement <dbl> 2.0, 2.0, 3.5, 3.5, 3.5, 5.0, 1.5, 1.8, 5.0, 2.0, 3.5, 3.5, 3.5, 2.5, 1.5, 1.5, 2.5, 3.5, 2.0, 2.0, 3.5, 3.5, 3.5…
$ number_cyl <dbl> 4, 4, 6, 6, 6, 8, 4, 4, 8, 4, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4, 6, 6, 6, 6, 8, 8, 8, 8, 8, 4, 6, 4, 4, 4, 4, 4, 4, 4,…
$ number_gears <dbl> 6, 8, 6, 8, 6, 8, 6, 1, 8, 8, 6, 8, 6, 6, 1, 4, 6, 6, 8, 8, 8, 6, 8, 8, 8, 8, 8, 8, 8, 7, 6, 6, 6, 1, 1, 4, 1, 1,…
$ city_mpg <dbl> 25, 22, 19, 19, 19, 16, 33, 43, 16, 22, 19, 19, 19, 23, 53, 30, 40, 21, 22, 21, 20, 19, 19, 29, 16, 16, 16, 16, 1…
$ hwy_mpg <dbl> 34, 32, 26, 28, 26, 25, 42, 40, 24, 33, 26, 28, 26, 31, 46, 36, 39, 31, 33, 30, 29, 26, 28, 34, 24, 23, 24, 23, 2…
As a large automaker, Toyota has a diverse lineup of cars, trucks, SUVs, and hybrid vehicles. Can we use unsupervised learning to categorize these vehicles in a sensible way with only the data we have been given?
For an individual quantitative variable, it is easy to measure how
far apart any two cars are: Take the difference between the numerical
values. The different variables are, however, on different scales and in
different units. For example, gears ranges only from
1 to 8, while city_mpg goes from
13 to 58. This means that some decision needs
to be made about rescaling the variables so that the differences along
each variable reasonably reflect how different the respective cars are.
There is more than one way to do this, and in fact, there is no
universally “best” solution—the best solution will always depend on the
data and your domain expertise. The dist() function takes a
simple and pragmatic point of view: Each variable is equally
important.
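As an illustration of one alternative, standardizing each quantitative variable before computing distances puts every column on a common scale (a sketch, not what dist() does by default):
# one possible rescaling: standardize the quantitative columns, then compute distances
cars %>%
  column_to_rownames(var = "model") %>%
  dplyr::select(-make) %>% # keep only the quantitative variables
  scale() %>%              # center each column and divide by its standard deviation
  dist()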
The output of dist() gives the distance from each
individual car to every other car.
car_diffs <- cars %>%
  column_to_rownames(var = "model") %>%
  dist() # the warning below arises from coercing the character `make` column to numeric; dropping it first with select(-make) would avoid the NAs
Warning: NAs introduced by coercion
str(car_diffs)
'dist' num [1:2775] 4.52 11.29 9.93 11.29 15.14 ...
- attr(*, "Size")= int 75
- attr(*, "Labels")= chr [1:75] "FR-S" "RC 200t" "RC 300 AWD" "RC 350" ...
- attr(*, "Diag")= logi FALSE
- attr(*, "Upper")= logi FALSE
- attr(*, "method")= chr "euclidean"
- attr(*, "call")= language dist(x = .)
Create a distance matrix object from car_diffs:
car_mat <- car_diffs %>% as.matrix()
car_mat[1:6, 1:6] %>% round(digits = 2)
#install if not installed
# install.packages("ape")
library(ape)
car_diffs %>%
hclust() %>%
as.phylo() %>%
plot(cex = 0.9, label.offset = 1)
Choose one of the car makers:
General Motors, Nissan, Ford Motor Company, Honda, Mercedes-Benz, BMW, Kia
- preferably a maker whose models you are familiar with, but not
necessarily.
a) (Total 20 pts) Create a tree, constructed by hierarchical clustering, that relates the chosen maker's car models to one another.
Hint: You can use the code above and make necessary modification.
YOUR CODE HERE:
cars <- X2016_FEGuide %>%
janitor::clean_names() %>%
dplyr::select(
make = mfr_name,
model = carline,
displacement = eng_displ,
number_cyl,
number_gears,
city_mpg = city_fe_guide_conventional_fuel,
hwy_mpg = hwy_fe_guide_conventional_fuel
) %>%
distinct(model, .keep_all = TRUE) %>%
filter(make == "Kia") #Kia selection
car_diffs <- cars %>%
column_to_rownames(var = "model") %>%
dist()
Warning: NAs introduced by coercion
str(car_diffs)
'dist' num [1:253] 9.42 10.1 10.79 18.87 17.34 ...
- attr(*, "Size")= int 23
- attr(*, "Labels")= chr [1:23] "Forte Koup" "Rio" "Rio ECO" "Forte" ...
- attr(*, "Diag")= logi FALSE
- attr(*, "Upper")= logi FALSE
- attr(*, "method")= chr "euclidean"
- attr(*, "call")= language dist(x = .)
car_mat <- car_diffs %>% as.matrix()
car_mat[1:6, 1:6] %>% round(digits = 2)
Forte Koup Rio Rio ECO Forte Optima HYBRID Optima HYBRID EX
Forte Koup 0.00 9.42 10.10 10.79 18.87 17.34
Rio 9.42 0.00 1.10 2.46 10.43 9.08
Rio ECO 10.10 1.10 0.00 3.11 9.40 8.02
Forte 10.79 2.46 3.11 0.00 11.03 9.88
Optima HYBRID 18.87 10.43 9.40 11.03 0.00 1.55
Optima HYBRID EX 17.34 9.08 8.02 9.88 1.55 0.00
glimpse(cars)
Rows: 23
Columns: 7
$ make <chr> "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "Kia", "…
$ model <chr> "Forte Koup", "Rio", "Rio ECO", "Forte", "Optima HYBRID", "Optima HYBRID EX", "Cadenza", "Forte 5", "K900", "Opti…
$ displacement <dbl> 1.6, 1.6, 1.6, 1.8, 2.4, 2.4, 3.3, 1.6, 3.8, 1.6, 2.0, 1.6, 2.0, 3.3, 3.3, 3.3, 2.0, 2.4, 2.0, 2.0, 3.3, 2.4, 2.0
$ number_cyl <dbl> 4, 4, 4, 4, 4, 4, 6, 4, 6, 4, 4, 4, 4, 6, 6, 6, 4, 4, 4, 4, 6, 4, 4
$ number_gears <dbl> 6, 6, 6, 6, 6, 6, 6, 6, 8, 7, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
$ city_mpg <dbl> 22, 27, 28, 26, 36, 35, 19, 21, 17, 28, 22, 24, 24, 18, 18, 17, 20, 21, 20, 19, 18, 19, 19
$ hwy_mpg <dbl> 30, 37, 37, 39, 40, 39, 28, 29, 26, 39, 32, 30, 31, 24, 25, 22, 27, 28, 26, 25, 26, 26, 25
car_diffs %>%
hclust() %>%
as.phylo() %>%
plot(cex = 0.9, label.offset = 1)
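To make the groupings concrete before interpreting them, the tree can be cut into a fixed number of clusters (a sketch; k = 4 is an arbitrary choice):
# cut the dendrogram into k clusters and list the models in each group
clusters <- car_diffs %>%
  hclust() %>%
  cutree(k = 4)
split(names(clusters), clusters)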
b) (Total 20 pts) Attempt to interpret the tree: how are the models in the same cluster similar, and how do the clusters differ?
YOUR COMMENTS: The clusters are determined by Euclidean distance over the quantitative variables selected in the first step: engine displacement (displacement), number of cylinders, number of gears, city_mpg, and hwy_mpg. Models that land in the same cluster are therefore similar on these measurements. For example, the hybrids (Optima HYBRID and Optima HYBRID EX) sit together because of their unusually high city and highway mileage; the small 1.6–1.8L four-cylinder economy cars (Rio, Rio ECO, Forte) form another tight group, as their small pairwise distances in the matrix above show; and the larger 3.3L and 3.8L six-cylinder models such as the Cadenza and K900 separate out at the other end. Because the variables are not rescaled, the largest splits in the tree are driven mainly by the miles-per-gallon variables (city_mpg and hwy_mpg), which have the widest numeric ranges.