**GEOG6160 Spatial Modeling Lab 6 - Intro to Machine Learning**

Initial Setup

## Packages
## mlr3 - machine learning framework
## mlr3viz, mlr3learners, and mlr3tuning extend the base mlr3 package

pkgs = c("mlr3", "mlr3viz", "mlr3learners", "mlr3tuning")
install.packages(pkgs)
## Set working directory
## Experiment with here()?

setwd("~/Desktop/University of Utah PhD /Course Work/Spring 2023 Semester/GEOG6160 - Spatial Modeling/Labs/lab06")

library(mlr3) ##ML
library(mlr3learners)
library(mlr3viz)
library(mlr3tuning)
library(dplyr) ##Data scrubbing/wrangling
library(sf) ##Simple features
library(raster) ##Raster package
library(tmap) ##Mapping package
library(ggplot2) ##Plotting

#library(here)

## Set here to lab 6

#set_here("~/Desktop/University of Utah PhD /Course Work/Spring 2023 Semester/GEOG6160 - Spatial Modeling/Labs/lab06")
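
## A sketch of the here() alternative (assumes an .Rproj or .here file
## marks the project root; the path components below are illustrative):
## library(here)
## housing = read.csv(here("Labs", "datafiles", "housing.csv"))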
##Housing data
housing = read.csv("../datafiles/housing.csv")

##California Border Shapefile
ca_sf = st_read("../datafiles/ca/ca.shp",
                crs = 4326)
## Reading layer `ca' from data source 
##   `/Users/davidleydet/Desktop/University of Utah PhD /Course Work/Spring 2023 Semester/GEOG6160 - Spatial Modeling/Labs/datafiles/ca/ca.shp' 
##   using driver `ESRI Shapefile'
## Simple feature collection with 1 feature and 14 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -124.482 ymin: 32.52883 xmax: -114.1312 ymax: 42.0095
## Geodetic CRS:  WGS 84

Data Pre-processing

##Check for missing (NA) values using colSums and is.na
colSums(is.na(housing))
##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                207                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0
##Remove missing values using dplyr's filter function

housing = housing %>% filter(!is.na(total_bedrooms))

##Check
colSums(is.na(housing))
##          longitude           latitude housing_median_age        total_rooms 
##                  0                  0                  0                  0 
##     total_bedrooms         population         households      median_income 
##                  0                  0                  0                  0 
## median_house_value    ocean_proximity 
##                  0                  0

Data Visualization

##Scale Housing Value Data
housing$median_value_scaled = (housing$median_house_value/100000)

##Simple Histogram

ggplot(data = housing, aes(x = median_value_scaled)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Median House Value ($100,000)") +
  ylab("Frequency") +
  labs(title = "CA Median House Value Histogram") +
  geom_vline(aes(xintercept = mean(median_value_scaled)),
             color = "blue",
             linetype = "dashed",
             linewidth = 1)

##Simple Histogram - Median Income

ggplot(data = housing, aes(x = median_income)) +
  geom_histogram(binwidth = 1) +
  xlab("Median Income") +
  ylab("Frequency") +
  labs(title = "CA Median Income") +
  geom_vline(aes(xintercept = mean(median_income)),
             color = "blue",
             linetype = "dashed",
             linewidth = 1)

##total_rooms and total_bedrooms are counts per district, so we need to normalize these values per household

##Avg rooms per house
housing$avg_rooms = housing$total_rooms / housing$households

##Bedroom to total room ratio
housing$bedroom_ratio = housing$total_bedrooms / housing$total_rooms

Map Visualizations

##Set as sf feature
housing = st_as_sf(housing,
                   coords = c("longitude", "latitude"),
                   crs = 4326)
##Initial Map

tm_shape(ca_sf) +
  tm_borders(col = "black") +
  tm_fill(col = "papayawhip") +
  tm_shape(housing) +
  tm_symbols("median_value_scaled",
             palette = "GnBu",
             alpha = 0.7,
             size = 0.2,
             border.lwd = NA,
             title.col = "Median House Value ($100,000)") +
  tm_layout(main.title = "CA Housing Data")

Linear Model Build

##Check the column names
names(housing)
##  [1] "housing_median_age"  "total_rooms"         "total_bedrooms"     
##  [4] "population"          "households"          "median_income"      
##  [7] "median_house_value"  "ocean_proximity"     "median_value_scaled"
## [10] "avg_rooms"           "bedroom_ratio"       "geometry"
## Set the geometry to NULL
## filter to remove districts above $500k
## select() to choose the columns we want (house value, avg rooms, bedroom ratio, housing median age)

housing2 = housing %>%
  st_set_geometry(NULL) %>%
  dplyr::filter(median_house_value <= 500000) %>%
  dplyr::select(median_house_value, avg_rooms, bedroom_ratio, housing_median_age)

## Check to ensure the dataframe is correct
names(housing2)
## [1] "median_house_value" "avg_rooms"          "bedroom_ratio"     
## [4] "housing_median_age"
## Build a basic OLS linear model using lm
## Median House Value as a function of all of the variables in housing2

fit1 = lm(median_house_value ~ ., 
          data = housing2)

summary(fit1)
## 
## Call:
## lm(formula = median_house_value ~ ., data = housing2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -212406  -72699  -15363   54732  378208 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         240435.33    4267.72  56.338  < 2e-16 ***
## avg_rooms             1520.71     329.30   4.618  3.9e-06 ***
## bedroom_ratio      -373380.48   13247.06 -28.186  < 2e-16 ***
## housing_median_age     850.00      55.41  15.340  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 94820 on 19471 degrees of freedom
## Multiple R-squared:  0.05836,    Adjusted R-squared:  0.05822 
## F-statistic: 402.3 on 3 and 19471 DF,  p-value: < 2.2e-16

mlr3 Package

  • Define a task: this describes the dataset, the response variable as well as any transformations of the data that are required

  • Select a learner: this is one of a set of machine learning algorithms that will be used to build the model

  • Set up the training and testing datasets to calibrate and validate the model, respectively

  • Define a performance measure to assess the skill of the model
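
A compact preview of how these four steps map onto mlr3 calls (a sketch using objects defined elsewhere in this lab; the *_preview names are purely illustrative):

## 1. Task: dataset plus target variable
task_preview = TaskRegr$new(id = "housing", backend = housing2,
                            target = "median_house_value")
## 2. Learner: the algorithm
learner_preview = lrn("regr.lm")
## 3. Train/test split (or a resampling strategy)
split_preview = rsmp("holdout", ratio = 0.8)
## 4. Performance measure
measure_preview = msr("regr.rmse")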

Tasks

Note: mlr3 defines two basic tasks: regression (for continuous variables) and classification (for categorical or binary variables). We’ll use the first of these with the housing data, as the values are continuous. A regression task can be created using the TaskRegr$new() constructor. In this constructor, we specify

  • A label to describe the task

  • A backend defining the data source (here the housing2 data frame). Note that this is quite flexible and can also include SQL databases and cloud data APIs

  • A target defining the response variable (i.e. the thing we want to model)

## Regression Task

task_housing = TaskRegr$new(id = "housing",
                            backend = housing2,
                            target = "median_house_value")

## Check
print(task_housing)
## <TaskRegr:housing> (19475 x 4)
## * Target: median_house_value
## * Properties: -
## * Features (3):
##   - dbl (3): avg_rooms, bedroom_ratio, housing_median_age
## Tasks have a series of attributes that allow you to investigate the characteristics of the data:

## Number of observations
task_housing$nrow

## Number of features
task_housing$ncol

## See all data
task_housing$data()

## Retrieve data for rows with ids 1, 51, and 101
task_housing$data(rows = c(1, 51, 101))

## Names of features (independent variables)
task_housing$feature_names

## Name of target variable (dependent variable)
task_housing$target_names

Learners

Note: The base mlr3 package only comes with a few machine learning algorithms or learners. Many more are available through the mlr3learners package.

## Display a list of learners

mlr_learners
## <DictionaryLearner> with 27 stored values
## Keys: classif.cv_glmnet, classif.debug, classif.featureless,
##   classif.glmnet, classif.kknn, classif.lda, classif.log_reg,
##   classif.multinom, classif.naive_bayes, classif.nnet, classif.qda,
##   classif.ranger, classif.rpart, classif.svm, classif.xgboost,
##   regr.cv_glmnet, regr.debug, regr.featureless, regr.glmnet, regr.kknn,
##   regr.km, regr.lm, regr.nnet, regr.ranger, regr.rpart, regr.svm,
##   regr.xgboost

Note: This package does not contain the algorithms, but acts to link a diverse range of R packages that include the machine learning methods. To get a better idea of how this works, let’s select a classification algorithm (a classification tree):

## Classification algorithm example

learner = mlr_learners$get("classif.rpart")

## View the function
print(learner)
## <LearnerClassifRpart:classif.rpart>: Classification Tree
## * Model: -
## * Parameters: xval=0
## * Packages: mlr3, rpart
## * Predict Types:  [response], prob
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: importance, missings, multiclass, selected_features,
##   twoclass, weights

Note: The output of the print() function describes this particular algorithm. You should see that the functions used are from the rpart package, as well as information about the type of predictions that can be made and the type of features that can be included. This also lists the current set of parameters (or hyper-parameters) for that particular algorithm. When a learner is created, these are set to default values, and you can see the full list of these by typing:

## Parameter Set

learner$param_set
## <ParamSet>
##                 id    class lower upper nlevels        default value
##  1:             cp ParamDbl     0     1     Inf           0.01      
##  2:     keep_model ParamLgl    NA    NA       2          FALSE      
##  3:     maxcompete ParamInt     0   Inf     Inf              4      
##  4:       maxdepth ParamInt     1    30      30             30      
##  5:   maxsurrogate ParamInt     0   Inf     Inf              5      
##  6:      minbucket ParamInt     1   Inf     Inf <NoDefault[3]>      
##  7:       minsplit ParamInt     1   Inf     Inf             20      
##  8: surrogatestyle ParamInt     0     1       2              0      
##  9:   usesurrogate ParamInt     0     2       3              2      
## 10:           xval ParamInt     0   Inf     Inf             10     0
## Create a learner for OLS regression (regr.lm)

learner = mlr_learners$get("regr.lm")

## Check
print(learner)
## <LearnerRegrLM:regr.lm>
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## Check parameter set

learner$param_set
## <ParamSet>
##              id    class lower upper nlevels        default value
##  1:          df ParamDbl  -Inf   Inf     Inf            Inf      
##  2:    interval ParamFct    NA    NA       3 <NoDefault[3]>      
##  3:       level ParamDbl  -Inf   Inf     Inf           0.95      
##  4:       model ParamLgl    NA    NA       2           TRUE      
##  5:      offset ParamLgl    NA    NA       2 <NoDefault[3]>      
##  6:    pred.var ParamUty    NA    NA     Inf <NoDefault[3]>      
##  7:          qr ParamLgl    NA    NA       2           TRUE      
##  8:       scale ParamDbl  -Inf   Inf     Inf                     
##  9: singular.ok ParamLgl    NA    NA       2           TRUE      
## 10:           x ParamLgl    NA    NA       2          FALSE      
## 11:           y ParamLgl    NA    NA       2          FALSE

Training and Testing Data

Next we’ll create a training and testing dataset. For this first iteration of our model, we’ll split the data manually into two sections, with 80% of the observations in the training set and 20% in the testing. In the following code, we:

  • Set the random seed to allow for reproducibility (this is optional)

  • Create a set of indices for the training data using the sample() function. The first argument (task_housing$nrow) gives the range of numbers to randomly sample (between 1 and the number of observations). The second argument (0.8 * task_housing$nrow) gives the number of random samples to take

  • Create a set of indices for the testing data as the indices not used in the training set

The set.seed() function just re-initializes R’s random number generator to the same value. This should ensure that you always get the same random split. You can skip this line if you’d like to see how much the results might vary if you have a different split into training and testing datasets.

##Set seed for reproducibility

set.seed(1900)

## Training set (80% of the data)
train_set = sample(task_housing$nrow,
                   0.8 * task_housing$nrow)

## Test set (the remaining 20% of the data)
## setdiff(x, y) returns the elements of x that are not in y
## seq_len(n) generates the integer sequence 1 to n, i.e. all row indices
test_set = setdiff(seq_len(task_housing$nrow), train_set)
## Check the length of the observations for both the training and test sets

length(train_set)
## [1] 15580
length(test_set)
## [1] 3895

Note: The learner that we just created has a variable (model) that contains the model results. At the moment, this is empty:

##Verify the model learner is empty (NULL Response)
learner$model
## NULL
##Train the model
learner$train(task_housing,
              row_ids = train_set)

##Check the model learner now has coefficients 
## **Will differ from our original lm because we are only using the training data**

learner$model
## 
## Call:
## stats::lm(formula = task$formula(), data = task$data())
## 
## Coefficients:
##        (Intercept)           avg_rooms       bedroom_ratio  housing_median_age  
##             243828                1396             -385504                 847

Prediction

Note: Prediction in mlr3 is fairly straightforward. We use our now-trained learner and the predict() method. We specify new data by using the testing set indices created above.

## Prediction using our test set
predict_val = learner$predict(task = task_housing,
                              row_ids = test_set)

## Print the first few rows of data
print(predict_val)
## <PredictionRegr> for 3895 observations:
##     row_ids  truth response
##           3 352100 249511.4
##           4 341300 224882.8
##           6 269700 205169.3
## ---                        
##       19451  57500 193545.0
##       19466 112000 187682.7
##       19473  92300 182542.4

Note: The print() function displays the first few rows of these data. Note that one column holds the truth - the observed value, and one holds the predicted value (response). The mlr3viz package contains functions for visualizing various aspects of your model. Here, we use the autoplot() function to display the predicted values of the test set against the truth:

##Use the mlr3viz tool's autoplot() to plot the predicted vs observed values (truth)

autoplot(predict_val)

Note:

Each point represents one observation from the test set. The x-axis is the predicted value, and the y-axis the observed value. The spread of the cloud gives some indication of the predictive skill of the learner; a wider spread suggests poorer performance.

The thin black line is the 1:1 line. If the model provides an unbiased prediction, then the points should lie along this line. The blue line is a linear model fit to the points - if this matches the thin line, this is also evidence for low bias. The difference here suggests that the model may slightly over-estimate at higher house values. One thing to note here is that there are several districts where the house prices are predicted to be negative. This is obviously incorrect (unless they are paying people to take a house). Transforming the outcome variable (e.g. log transformation) would help avoid this problem.
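
A minimal sketch of that transformation (illustrative, not part of the lab), reusing the housing2 data frame from above:

## Log-transform the response so back-transformed predictions stay positive
housing2_log = housing2
housing2_log$log_house_value = log(housing2_log$median_house_value)
housing2_log$median_house_value = NULL

## Define a task on the log scale
task_log = TaskRegr$new(id = "housing_log",
                        backend = housing2_log,
                        target = "log_house_value")

## Predictions from this task are on the log scale; back-transform with
## exp(), e.g. exp(predict_val$response), which is always positive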

Performance Measures

Note: This plot allows us to visualize how well the model has predicted for the test data, but we also need to quantify this using a performance measure. This will eventually allow us to compare different learning algorithms or different setups of the same algorithm to see which is best. Not too surprisingly then, mlr3 comes with a whole suite of different measures that we can use. To see the full list, type:

## Check the list of performance measures 

mlr_measures
## <DictionaryMeasure> with 62 stored values
## Keys: aic, bic, classif.acc, classif.auc, classif.bacc, classif.bbrier,
##   classif.ce, classif.costs, classif.dor, classif.fbeta, classif.fdr,
##   classif.fn, classif.fnr, classif.fomr, classif.fp, classif.fpr,
##   classif.logloss, classif.mauc_au1p, classif.mauc_au1u,
##   classif.mauc_aunp, classif.mauc_aunu, classif.mbrier, classif.mcc,
##   classif.npv, classif.ppv, classif.prauc, classif.precision,
##   classif.recall, classif.sensitivity, classif.specificity, classif.tn,
##   classif.tnr, classif.tp, classif.tpr, debug, oob_error, regr.bias,
##   regr.ktau, regr.mae, regr.mape, regr.maxae, regr.medae, regr.medse,
##   regr.mse, regr.msle, regr.pbias, regr.rae, regr.rmse, regr.rmsle,
##   regr.rrse, regr.rse, regr.rsq, regr.sae, regr.smape, regr.srho,
##   regr.sse, selected_features, sim.jaccard, sim.phi, time_both,
##   time_predict, time_train
## Calculate RMSE

## Create the measure variable
measure = msr("regr.rmse")

## Display the score
predict_val$score(measure)
## regr.rmse 
##  93252.98
## Calculate Mean Absolute Error

## Create the measure variable
measure = msr("regr.mae")

## Display the score
predict_val$score(measure)
## regr.mae 
## 74691.43
## Calculate the bias

## Create the measure variable
measure = msr("regr.bias")

## Display the score
predict_val$score(measure)
## regr.bias 
## -156.4089

Note: We can also use these measures to assess the calibration (the goodness-of-fit). For this, we run a second prediction for the training dataset and calculate the RMSE.

## Calculate the calibration of the model

## Second prediction for the training dataset
predict_cal = learner$predict(task_housing, 
                              row_ids = train_set)

## Create the measure variable
measure = msr("regr.rmse")

## Display the score
predict_cal$score(measure)
## regr.rmse 
##  95193.33

Interpretation: The RMSE is a good measure of the average error. It is worth noting that this suggests our error is quite large: over $90,000 relative to the actual price.
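
To put this error in context, a quick sketch comparing the calibration RMSE to the mean house value in the data (variable names as defined above):

## RMSE as a fraction of the mean house value
predict_cal$score(msr("regr.rmse")) / mean(housing2$median_house_value)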

Resampling

Note: So far, we have built and tested our model on a single split of the data (the hold-out method). However, if the training and testing datasets are not well set up, the estimates of model performance can be biased. There are several more exhaustive resampling strategies that can be used instead, and we will implement one of these now. To see the list of available strategies, type:

## List of available resampling methods

mlr_resamplings
## <DictionaryResampling> with 9 stored values
## Keys: bootstrap, custom, custom_cv, cv, holdout, insample, loo,
##   repeated_cv, subsampling
## Create our resampling variable which selects the method (k-fold "cv")
## 5 folds (the number of times the data are split)
resampling = rsmp("cv", 
                  folds = 5)

## Alternative hold out approach
## rsmp("holdout", ratio = 0.8)

## Check
print(resampling)
## <ResamplingCV>: Cross-Validation
## * Iterations: 5
## * Instantiated: FALSE
## * Parameters: folds=5

Note: Instantiated set to false means the resampler hasn’t been run yet.

We now run the resampling strategy. To do this, we need to provide a task, so that the dataset can be divided up appropriately. This is carried out by calling the $instantiate() method, and the resulting indices for training and testing for the different folds are stored in the resampling object:

## Run the resample
resampling$instantiate(task_housing)

## Check the number of iterations
resampling$iters
## [1] 5
## Check one of the training sets
head(resampling$train_set(1))
## [1]  2  7  8 12 14 15
## Check one of the test sets 
head(resampling$test_set(1))
## [1]  3  5 13 26 40 46
## Dimensions check
str(resampling$train_set(3))
##  int [1:15580] 3 5 13 26 40 46 48 49 50 53 ...
## Check resampling
print(resampling)
## <ResamplingCV>: Cross-Validation
## * Iterations: 5
## * Instantiated: TRUE
## * Parameters: folds=5
##Instantiated TRUE means the resampler has been run

Note: Now with a task, a learner and a resampling object, we can call resample(), which calibrates the model using the training sets from the resampling strategy, and predicts for the test sets. The argument store_models = TRUE tells the function to save each individual model as it is built.

## Re-run the model
rr = mlr3::resample(task = task_housing,
              learner = learner,
              resampling = resampling,
              store_models = TRUE) ## Saves each individual Model
## INFO  [09:55:31.964] [mlr3] Applying learner 'regr.lm' on task 'housing' (iter 1/5)
## INFO  [09:55:31.993] [mlr3] Applying learner 'regr.lm' on task 'housing' (iter 2/5)
## INFO  [09:55:32.007] [mlr3] Applying learner 'regr.lm' on task 'housing' (iter 3/5)
## INFO  [09:55:32.021] [mlr3] Applying learner 'regr.lm' on task 'housing' (iter 4/5)
## INFO  [09:55:32.029] [mlr3] Applying learner 'regr.lm' on task 'housing' (iter 5/5)
##check the output

print(rr)
## <ResampleResult> of 5 iterations
## * Task: housing
## * Learner: regr.lm
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
##Errors and warnings can be examined with
## rr$errors
## rr$warnings
## Set RMSE
measure = msr("regr.rmse")

## See each individual fold's score
## Alternative - rr$score(msr("regr.rmse"))
rr$score(measure)
##              task task_id             learner learner_id         resampling
## 1: <TaskRegr[47]> housing <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 2: <TaskRegr[47]> housing <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 3: <TaskRegr[47]> housing <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 4: <TaskRegr[47]> housing <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 5: <TaskRegr[47]> housing <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##    resampling_id iteration           prediction regr.rmse
## 1:            cv         1 <PredictionRegr[19]>  93385.70
## 2:            cv         2 <PredictionRegr[19]>  96691.02
## 3:            cv         3 <PredictionRegr[19]>  94425.22
## 4:            cv         4 <PredictionRegr[19]>  95344.56
## 5:            cv         5 <PredictionRegr[19]>  94338.20
## Check the aggregate (mean) RMSE across the folds

rr$aggregate(measure)
## regr.rmse 
##  94836.94

Note: All of the models are stored in the object's $learners field

## All models
rr$learners
## [[1]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## 
## [[2]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## 
## [[3]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## 
## [[4]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## 
## [[5]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights
## Model 1
rr$learners[[1]]
## <LearnerRegrLM:regr.lm>
## * Model: lm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types:  [response], se
## * Feature Types: logical, integer, numeric, character, factor
## * Properties: loglik, weights

Note: And we can loop through all models quite simply and print out the coefficients for each one to see how much these vary across the different folds of data:

## Loop through each model to extract coefficients

for (i in 1:5) {
  lrn = rr$learners[[i]] ##temp variable lrn stores the model
  print(coef(lrn$model)) ##prints the $model coefficients per model
}
##        (Intercept)          avg_rooms      bedroom_ratio housing_median_age 
##        237369.7105          1895.7208       -367790.2646           857.7041 
##        (Intercept)          avg_rooms      bedroom_ratio housing_median_age 
##        243895.2345          1281.0491       -389496.4919           874.4974 
##        (Intercept)          avg_rooms      bedroom_ratio housing_median_age 
##        242482.0511          1364.3332       -375242.8442           818.0104 
##        (Intercept)          avg_rooms      bedroom_ratio housing_median_age 
##        234507.2801          1870.9016       -360713.1893           907.7433 
##        (Intercept)          avg_rooms      bedroom_ratio housing_median_age 
##        242751.3943          1322.1513       -371765.7135           794.4392

Changing Model

This section is designed to show how easy it is to switch out the method you are using to try a different approach. Here, we’ll try a penalized linear model, the Lasso. To test this, we simply need to define a new learner using the appropriate function (here: regr.glmnet). One small addition is that we set a couple of model parameters (we’ll look at model parameters in more detail in the next lab):

  • alpha: the mixing ratio. alpha = 1 gives the Lasso algorithm (alpha = 0 gives ridge regression)

  • lambda: the penalty weight

## Note: received warning - "Warning: Package 'glmnet' required but not installed for Learner 'regr.glmnet'"

#install.packages("glmnet")
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-6
## Check the learners
mlr_learners
## <DictionaryLearner> with 27 stored values
## Keys: classif.cv_glmnet, classif.debug, classif.featureless,
##   classif.glmnet, classif.kknn, classif.lda, classif.log_reg,
##   classif.multinom, classif.naive_bayes, classif.nnet, classif.qda,
##   classif.ranger, classif.rpart, classif.svm, classif.xgboost,
##   regr.cv_glmnet, regr.debug, regr.featureless, regr.glmnet, regr.kknn,
##   regr.km, regr.lm, regr.nnet, regr.ranger, regr.rpart, regr.svm,
##   regr.xgboost
## Define a new learner
learner2 = mlr_learners$get("regr.glmnet")

## Check
print(learner2)
## <LearnerRegrGlmnet:regr.glmnet>
## * Model: -
## * Parameters: family=gaussian
## * Packages: mlr3, mlr3learners, glmnet
## * Predict Types:  [response]
## * Feature Types: logical, integer, numeric
## * Properties: weights
## Check parameter set
learner2$param_set
## <ParamSet>
##                       id    class lower upper nlevels        default parents
##  1:            alignment ParamFct    NA    NA       2         lambda        
##  2:                alpha ParamDbl     0     1     Inf              1        
##  3:                  big ParamDbl  -Inf   Inf     Inf        9.9e+35        
##  4:               devmax ParamDbl     0     1     Inf          0.999        
##  5:                dfmax ParamInt     0   Inf     Inf <NoDefault[3]>        
##  6:                  eps ParamDbl     0     1     Inf          1e-06        
##  7:                epsnr ParamDbl     0     1     Inf          1e-08        
##  8:                exact ParamLgl    NA    NA       2          FALSE        
##  9:              exclude ParamInt     1   Inf     Inf <NoDefault[3]>        
## 10:                 exmx ParamDbl  -Inf   Inf     Inf            250        
## 11:               family ParamFct    NA    NA       2       gaussian        
## 12:                 fdev ParamDbl     0     1     Inf          1e-05        
## 13:                gamma ParamDbl  -Inf   Inf     Inf              1   relax
## 14:              grouped ParamLgl    NA    NA       2           TRUE        
## 15:            intercept ParamLgl    NA    NA       2           TRUE        
## 16:                 keep ParamLgl    NA    NA       2          FALSE        
## 17:               lambda ParamUty    NA    NA     Inf <NoDefault[3]>        
## 18:     lambda.min.ratio ParamDbl     0     1     Inf <NoDefault[3]>        
## 19:         lower.limits ParamUty    NA    NA     Inf <NoDefault[3]>        
## 20:                maxit ParamInt     1   Inf     Inf         100000        
## 21:                mnlam ParamInt     1   Inf     Inf              5        
## 22:                 mxit ParamInt     1   Inf     Inf            100        
## 23:               mxitnr ParamInt     1   Inf     Inf             25        
## 24:            newoffset ParamUty    NA    NA     Inf <NoDefault[3]>        
## 25:              nlambda ParamInt     1   Inf     Inf            100        
## 26:               offset ParamUty    NA    NA     Inf                       
## 27:             parallel ParamLgl    NA    NA       2          FALSE        
## 28:       penalty.factor ParamUty    NA    NA     Inf <NoDefault[3]>        
## 29:                 pmax ParamInt     0   Inf     Inf <NoDefault[3]>        
## 30:                 pmin ParamDbl     0     1     Inf          1e-09        
## 31:                 prec ParamDbl  -Inf   Inf     Inf          1e-10        
## 32:                relax ParamLgl    NA    NA       2          FALSE        
## 33:                    s ParamDbl     0   Inf     Inf           0.01        
## 34:          standardize ParamLgl    NA    NA       2           TRUE        
## 35: standardize.response ParamLgl    NA    NA       2          FALSE        
## 36:               thresh ParamDbl     0   Inf     Inf          1e-07        
## 37:             trace.it ParamInt     0     1       2              0        
## 38:        type.gaussian ParamFct    NA    NA       2 <NoDefault[3]>  family
## 39:        type.logistic ParamFct    NA    NA       2 <NoDefault[3]>        
## 40:     type.multinomial ParamFct    NA    NA       2 <NoDefault[3]>        
## 41:         upper.limits ParamUty    NA    NA     Inf <NoDefault[3]>        
##                       id    class lower upper nlevels        default parents
##        value
##  1:         
##  2:         
##  3:         
##  4:         
##  5:         
##  6:         
##  7:         
##  8:         
##  9:         
## 10:         
## 11: gaussian
## 12:         
## 13:         
## 14:         
## 15:         
## 16:         
## 17:         
## 18:         
## 19:         
## 20:         
## 21:         
## 22:         
## 23:         
## 24:         
## 25:         
## 26:         
## 27:         
## 28:         
## 29:         
## 30:         
## 31:         
## 32:         
## 33:         
## 34:         
## 35:         
## 36:         
## 37:         
## 38:         
## 39:         
## 40:         
## 41:         
##        value
## Set parameters
## alpha = mixing ratio (1 = Lasso; 0 = ridge regression)
## lambda = penalty weight

learner2$param_set$values = list(alpha = 1,
                                 lambda = 1e-3)

## Check parameters
learner2$param_set$values
## $alpha
## [1] 1
## 
## $lambda
## [1] 0.001
## Using the previous training set

learner2$train(task_housing,
               row_ids = train_set)
## To cross-validate, we can simply reuse the resampler defined earlier

rr2 = mlr3::resample(task = task_housing,
               learner = learner2,
               resampling = resampling,
               store_models = TRUE)
## INFO  [09:55:32.538] [mlr3] Applying learner 'regr.glmnet' on task 'housing' (iter 1/5)
## INFO  [09:55:32.566] [mlr3] Applying learner 'regr.glmnet' on task 'housing' (iter 2/5)
## INFO  [09:55:32.576] [mlr3] Applying learner 'regr.glmnet' on task 'housing' (iter 3/5)
## INFO  [09:55:32.585] [mlr3] Applying learner 'regr.glmnet' on task 'housing' (iter 4/5)
## INFO  [09:55:32.593] [mlr3] Applying learner 'regr.glmnet' on task 'housing' (iter 5/5)
## Check

print(rr2)
## <ResampleResult> of 5 iterations
## * Task: housing
## * Learner: regr.glmnet
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
##No errors or warnings

##RMSEs
rr2$score(measure)
##              task task_id                 learner  learner_id
## 1: <TaskRegr[47]> housing <LearnerRegrGlmnet[37]> regr.glmnet
## 2: <TaskRegr[47]> housing <LearnerRegrGlmnet[37]> regr.glmnet
## 3: <TaskRegr[47]> housing <LearnerRegrGlmnet[37]> regr.glmnet
## 4: <TaskRegr[47]> housing <LearnerRegrGlmnet[37]> regr.glmnet
## 5: <TaskRegr[47]> housing <LearnerRegrGlmnet[37]> regr.glmnet
##            resampling resampling_id iteration           prediction regr.rmse
## 1: <ResamplingCV[20]>            cv         1 <PredictionRegr[19]>  93385.73
## 2: <ResamplingCV[20]>            cv         2 <PredictionRegr[19]>  96691.02
## 3: <ResamplingCV[20]>            cv         3 <PredictionRegr[19]>  94425.22
## 4: <ResamplingCV[20]>            cv         4 <PredictionRegr[19]>  95344.57
## 5: <ResamplingCV[20]>            cv         5 <PredictionRegr[19]>  94338.20
## Compare with the aggregate RMSE of the original lm model (rr); the
## per-fold scores above show the Lasso fit is nearly identical
## (rr2$aggregate(measure) gives the corresponding glmnet aggregate)

rr$aggregate(measure)
## regr.rmse 
##  94836.94
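
Rather than fixing lambda by hand, glmnet can select it by internal cross-validation. A minimal sketch using the regr.cv_glmnet learner (listed in mlr_learners above); the object name is illustrative:

## Learner that tunes lambda internally via cross-validation
learner_cv = lrn("regr.cv_glmnet", alpha = 1)
learner_cv$train(task_housing, row_ids = train_set)

## The selected penalty is stored in the fitted model
## learner_cv$model$lambda.min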

Exercise

Data Setup

## Variable Names
names(housing)
##  [1] "housing_median_age"  "total_rooms"         "total_bedrooms"     
##  [4] "population"          "households"          "median_income"      
##  [7] "median_house_value"  "ocean_proximity"     "median_value_scaled"
## [10] "avg_rooms"           "bedroom_ratio"       "geometry"
## Incorporate more covariates
## - use select() with negated names to drop the unwanted variables
## Be sure to drop the scaled median value variable!

housing3 = housing %>%
  st_set_geometry(NULL) %>%
  dplyr::filter(median_house_value <= 500000) %>%
  dplyr::select(-total_rooms, -total_bedrooms, -ocean_proximity, -median_value_scaled)

## Check the new dataset
names(housing3)
## [1] "housing_median_age" "population"         "households"        
## [4] "median_income"      "median_house_value" "avg_rooms"         
## [7] "bedroom_ratio"

1. Basic Linear Model

## Build a linear model

fit2 = lm(median_house_value ~ ., 
          data = housing3)

library(jtools) ##Better output table
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
##jtools summ function for table visualization
summ(fit2)
## MODEL INFO:
## Observations: 19475
## Dependent Variable: median_house_value
## Type: OLS linear regression
## 
## MODEL FIT:
## F(6,19468) = 3816.91
## R² = 0.54
## Adj. R² = 0.54
## 
##                           Est.       S.E.   t val.      p
## (Intercept)         -158218.46    4234.93   -37.36   0.00
## housing_median_age     1712.91      41.22    41.55   0.00
## population              -32.17       1.00   -32.24   0.00
## households              114.75       3.01    38.15   0.00
## median_income         52813.38     393.59   134.18   0.00
## avg_rooms              -207.53     232.05    -0.89   0.37
## bedroom_ratio        455038.35   11301.13    40.26   0.00
## 
## Standard errors: OLS
##Original Summary Table for reference

## Model Summary
summary(fit2)
## 
## Call:
## lm(formula = median_house_value ~ ., data = housing3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -657404  -41785   -9615   31716  647298 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -1.582e+05  4.235e+03 -37.360   <2e-16 ***
## housing_median_age  1.713e+03  4.122e+01  41.551   <2e-16 ***
## population         -3.217e+01  9.980e-01 -32.237   <2e-16 ***
## households          1.147e+02  3.008e+00  38.146   <2e-16 ***
## median_income       5.281e+04  3.936e+02 134.182   <2e-16 ***
## avg_rooms          -2.075e+02  2.320e+02  -0.894    0.371    
## bedroom_ratio       4.550e+05  1.130e+04  40.265   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 66240 on 19468 degrees of freedom
## Multiple R-squared:  0.5405, Adjusted R-squared:  0.5404 
## F-statistic:  3817 on 6 and 19468 DF,  p-value: < 2.2e-16

2. Task Build

## Define the task

task_housing3 = TaskRegr$new(id = "housing3",
                            backend = housing3,
                            target = "median_house_value")

## Check
print(task_housing3)
## <TaskRegr:housing3> (19475 x 7)
## * Target: median_house_value
## * Properties: -
## * Features (6):
##   - dbl (6): avg_rooms, bedroom_ratio, households, housing_median_age,
##     median_income, population

3. Resampling and RMSE

## Create our resampling variable which selects the method (k-fold "cv")
## 10 folds (the number of times the data are split)
resampling10 = rsmp("cv", 
                  folds = 10)

## Check
print(resampling10)
## <ResamplingCV>: Cross-Validation
## * Iterations: 10
## * Instantiated: FALSE
## * Parameters: folds=10
## Run the resample
resampling10$instantiate(task_housing3)

## Check the number of iterations
resampling10$iters
## [1] 10
## Check that it instantiated
print(resampling10)
## <ResamplingCV>: Cross-Validation
## * Iterations: 10
## * Instantiated: TRUE
## * Parameters: folds=10
## Run the model
rr3 = mlr3::resample(task = task_housing3,
              learner = learner,
              resampling = resampling10,
              store_models = TRUE) ## Saves each individual Model
## INFO  [09:55:32.834] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 1/10)
## INFO  [09:55:32.851] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 2/10)
## INFO  [09:55:32.862] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 3/10)
## INFO  [09:55:32.872] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 4/10)
## INFO  [09:55:32.888] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 5/10)
## INFO  [09:55:32.909] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 6/10)
## INFO  [09:55:32.920] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 7/10)
## INFO  [09:55:33.105] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 8/10)
## INFO  [09:55:33.116] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 9/10)
## INFO  [09:55:33.128] [mlr3] Applying learner 'regr.lm' on task 'housing3' (iter 10/10)
## Set RMSE
measure = msr("regr.rmse")

## See each individual fold's score
## Alternative - rr$score(msr("regr.rmse"))
rr3$score(measure)
##               task  task_id             learner learner_id         resampling
##  1: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  2: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  3: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  4: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  5: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  6: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  7: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  8: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  9: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 10: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##     resampling_id iteration           prediction regr.rmse
##  1:            cv         1 <PredictionRegr[19]>  64162.67
##  2:            cv         2 <PredictionRegr[19]>  65150.76
##  3:            cv         3 <PredictionRegr[19]>  69775.55
##  4:            cv         4 <PredictionRegr[19]>  67357.38
##  5:            cv         5 <PredictionRegr[19]>  68175.54
##  6:            cv         6 <PredictionRegr[19]>  64825.18
##  7:            cv         7 <PredictionRegr[19]>  64804.43
##  8:            cv         8 <PredictionRegr[19]>  64900.43
##  9:            cv         9 <PredictionRegr[19]>  64666.02
## 10:            cv        10 <PredictionRegr[19]>  69301.31
## Aggregate RMSE

rr3$aggregate(measure)
## regr.rmse 
##  66311.93

4. Interpretation

## Bias
measure = msr("regr.bias")

## Individual
rr3$score(measure)
##               task  task_id             learner learner_id         resampling
##  1: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  2: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  3: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  4: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  5: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  6: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  7: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  8: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  9: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 10: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##     resampling_id iteration           prediction  regr.bias
##  1:            cv         1 <PredictionRegr[19]> -1616.8066
##  2:            cv         2 <PredictionRegr[19]>   590.0109
##  3:            cv         3 <PredictionRegr[19]>  1887.0899
##  4:            cv         4 <PredictionRegr[19]>    17.1150
##  5:            cv         5 <PredictionRegr[19]>  2721.3795
##  6:            cv         6 <PredictionRegr[19]> -1268.7146
##  7:            cv         7 <PredictionRegr[19]> -2159.8490
##  8:            cv         8 <PredictionRegr[19]> -2306.4858
##  9:            cv         9 <PredictionRegr[19]>  -326.6525
## 10:            cv        10 <PredictionRegr[19]>  2524.1929
## Bias
measure = msr("regr.bias")

## Aggregate
rr3$aggregate(measure)
## regr.bias 
##  6.127976
## mean absolute error
rr3$score(msr("regr.mae"))
##               task  task_id             learner learner_id         resampling
##  1: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  2: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  3: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  4: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  5: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  6: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  7: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  8: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##  9: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
## 10: <TaskRegr[47]> housing3 <LearnerRegrLM[37]>    regr.lm <ResamplingCV[20]>
##     resampling_id iteration           prediction regr.mae
##  1:            cv         1 <PredictionRegr[19]> 48776.69
##  2:            cv         2 <PredictionRegr[19]> 48813.07
##  3:            cv         3 <PredictionRegr[19]> 50713.28
##  4:            cv         4 <PredictionRegr[19]> 49270.74
##  5:            cv         5 <PredictionRegr[19]> 50440.96
##  6:            cv         6 <PredictionRegr[19]> 48653.87
##  7:            cv         7 <PredictionRegr[19]> 48253.11
##  8:            cv         8 <PredictionRegr[19]> 48053.36
##  9:            cv         9 <PredictionRegr[19]> 47464.03
## 10:            cv        10 <PredictionRegr[19]> 49881.19
## Aggregate MAE
rr3$aggregate(msr("regr.mae"))
## regr.mae 
## 49032.03

This model predicts better than the model we built in the lab (even though the resampling setups differ: 10 folds here versus 5 earlier): the individual fold root mean squared errors (RMSE) and mean absolute errors (MAE) are both substantially lower. The aggregate RMSE, MAE, and model bias are also substantially lower for this model (rr3) than for our original model (rr).
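
A compact side-by-side of the two resampled models (a sketch; both result objects were created above, and the fold counts differ, so this is indicative rather than an exact like-for-like comparison):

## Aggregate RMSE for the original (rr) and expanded (rr3) models
data.frame(model = c("rr (3 covariates, 5-fold)", "rr3 (6 covariates, 10-fold)"),
           rmse = c(rr$aggregate(msr("regr.rmse")),
                    rr3$aggregate(msr("regr.rmse"))))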