Model trees are similar but incorporate linear regression at the
leaf nodes.
Example: Calculating Standard Deviation Reduction (SDR)
# Set up the data - a vector representing sample values
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
# Defining two potential ways to split the data
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5) # First subset
at2 <- c(6, 6, 7, 7, 7, 7) # Second subset
bt1 <- c(1, 1, 1, 2, 2, 3, 4) # Alternative first subset
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7) # Alternative second subset
# Compute the Standard Deviation Reduction (SDR) for two possible splits
# SDR measures the decrease in variance after a split, helping to determine the best split.
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
# Compare the SDR for each split
# The split with the higher SDR is preferred as it results in a greater reduction of variance.
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
Exercise No. 3: Estimating Wine Quality using Regression Trees
Step 2: Exploring and Preparing the Data
# Load the dataset containing white wine characteristics and quality ratings
wine <- read.csv("whitewines.csv")
# Examine the structure of the dataset to understand variable types and missing values
str(wine)
'data.frame': 4898 obs. of 12 variables:
$ fixed.acidity : num 6.7 5.7 5.9 5.3 6.4 7 7.9 6.6 7 6.5 ...
$ volatile.acidity : num 0.62 0.22 0.19 0.47 0.29 0.14 0.12 0.38 0.16 0.37 ...
$ citric.acid : num 0.24 0.2 0.26 0.1 0.21 0.41 0.49 0.28 0.3 0.33 ...
$ residual.sugar : num 1.1 16 7.4 1.3 9.65 0.9 5.2 2.8 2.6 3.9 ...
$ chlorides : num 0.039 0.044 0.034 0.036 0.041 0.037 0.049 0.043 0.043 0.027 ...
$ free.sulfur.dioxide : num 6 41 33 11 36 22 33 17 34 40 ...
$ total.sulfur.dioxide: num 62 113 123 74 119 95 152 67 90 130 ...
$ density : num 0.993 0.999 0.995 0.991 0.993 ...
$ pH : num 3.41 3.22 3.49 3.48 2.99 3.25 3.18 3.21 2.88 3.28 ...
$ sulphates : num 0.32 0.46 0.42 0.54 0.34 0.43 0.47 0.47 0.47 0.39 ...
$ alcohol : num 10.4 8.9 10.1 11.2 10.9 ...
$ quality : int 5 6 6 4 6 6 6 6 6 7 ...
# Visualize the distribution of wine quality ratings
# This helps in understanding how balanced the dataset is.
hist(wine$quality, main = "Distribution of Wine Quality", xlab = "Quality Rating", col = "lightblue")

# Summary statistics of the dataset
# Provides insights into the range, mean, and spread of each feature
summary(wine)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600 Min. :0.00900
1st Qu.: 6.300 1st Qu.:0.2100 1st Qu.:0.2700 1st Qu.: 1.700 1st Qu.:0.03600
Median : 6.800 Median :0.2600 Median :0.3200 Median : 5.200 Median :0.04300
Mean : 6.855 Mean :0.2782 Mean :0.3342 Mean : 6.391 Mean :0.04577
3rd Qu.: 7.300 3rd Qu.:0.3200 3rd Qu.:0.3900 3rd Qu.: 9.900 3rd Qu.:0.05000
Max. :14.200 Max. :1.1000 Max. :1.6600 Max. :65.800 Max. :0.34600
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
Min. : 2.00 Min. : 9.0 Min. :0.9871 Min. :2.720 Min. :0.2200
1st Qu.: 23.00 1st Qu.:108.0 1st Qu.:0.9917 1st Qu.:3.090 1st Qu.:0.4100
Median : 34.00 Median :134.0 Median :0.9937 Median :3.180 Median :0.4700
Mean : 35.31 Mean :138.4 Mean :0.9940 Mean :3.188 Mean :0.4898
3rd Qu.: 46.00 3rd Qu.:167.0 3rd Qu.:0.9961 3rd Qu.:3.280 3rd Qu.:0.5500
Max. :289.00 Max. :440.0 Max. :1.0390 Max. :3.820 Max. :1.0800
alcohol quality
Min. : 8.00 Min. :3.000
1st Qu.: 9.50 1st Qu.:5.000
Median :10.40 Median :6.000
Mean :10.51 Mean :5.878
3rd Qu.:11.40 3rd Qu.:6.000
Max. :14.20 Max. :9.000
Step 3: Splitting Data into Training and Testing Sets
# Load the necessary library for data partitioning
library(caret)
# Set seed for reproducibility
set.seed(123)
# Split data into 80% training and 20% testing
trainIndex <- createDataPartition(wine$quality, p = 0.8, list = FALSE)
wine_train <- wine[trainIndex, ]
wine_test <- wine[-trainIndex, ]
Step 4: Training the Regression Tree Model
# Load necessary library for regression trees
library(rpart)
# Train a regression tree model to predict wine quality
wine_model <- rpart(quality ~ ., data = wine_train, method = "anova")
# Display the structure of the tree
print(wine_model)
n= 3919
node), split, n, deviance, yval
* denotes terminal node
1) root 3919 3101.91200 5.879306
2) alcohol< 10.85 2468 1471.87800 5.600891
4) volatile.acidity>=0.2525 1296 631.87650 5.358025 *
5) volatile.acidity< 0.2525 1172 679.02650 5.869454 *
3) alcohol>=10.85 1451 1113.33600 6.352860
6) free.sulfur.dioxide< 11.5 85 89.41176 5.352941 *
7) free.sulfur.dioxide>=11.5 1366 933.64930 6.415081
14) alcohol< 11.74167 664 439.33580 6.218373 *
15) alcohol>=11.74167 702 444.31910 6.601140 *
# Visualize the regression tree structure
library(rpart.plot)
rpart.plot(wine_model, main = "Regression Tree for Wine Quality")

Step 5: Evaluating the Model
# Make predictions on the test set
wine_pred <- predict(wine_model, wine_test)
# Calculate Root Mean Squared Error (RMSE) to assess model performance
rmse <- sqrt(mean((wine_test$quality - wine_pred)^2))
print(paste("Root Mean Squared Error (RMSE):", round(rmse, 2)))
[1] "Root Mean Squared Error (RMSE): 0.76"
Step 6: Improving the Model (Optional)
# Prune the tree to reduce overfitting
pruned_model <- prune(wine_model, cp = 0.01) # Adjust complexity parameter as needed
rpart.plot(pruned_model, main = "Pruned Regression Tree")

# Evaluate the pruned model
wine_pred_pruned <- predict(pruned_model, wine_test)
rmse_pruned <- sqrt(mean((wine_test$quality - wine_pred_pruned)^2))
print(paste("Pruned Model RMSE:", round(rmse_pruned, 2)))
[1] "Pruned Model RMSE: 0.76"
LS0tCnRpdGxlOiAiUmVncmVzc2lvbiBUcmVlcyIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKIyMjIyBSZWdyZXNzaW9uIFRyZWVzIGFuZCBNb2RlbCBUcmVlcwoKIyMgVW5kZXJzdGFuZGluZyBSZWdyZXNzaW9uIFRyZWVzIGFuZCBNb2RlbCBUcmVlcwoKIyBSZWdyZXNzaW9uIHRyZWVzIGFyZSBhIHR5cGUgb2YgZGVjaXNpb24gdHJlZSB1c2VkIGZvciBwcmVkaWN0aW5nIGNvbnRpbnVvdXMgdmFsdWVzLgojIE1vZGVsIHRyZWVzIGFyZSBzaW1pbGFyIGJ1dCBpbmNvcnBvcmF0ZSBsaW5lYXIgcmVncmVzc2lvbiBhdCB0aGUgbGVhZiBub2Rlcy4KCiMjIEV4YW1wbGU6IENhbGN1bGF0aW5nIFN0YW5kYXJkIERldmlhdGlvbiBSZWR1Y3Rpb24gKFNEUikKCmBgYHtyfQojIFNldCB1cCB0aGUgZGF0YSAtIGEgdmVjdG9yIHJlcHJlc2VudGluZyBzYW1wbGUgdmFsdWVzCnRlZSA8LSBjKDEsIDEsIDEsIDIsIDIsIDMsIDQsIDUsIDUsIDYsIDYsIDcsIDcsIDcsIDcpCgojIERlZmluaW5nIHR3byBwb3RlbnRpYWwgd2F5cyB0byBzcGxpdCB0aGUgZGF0YQphdDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1KSAgIyBGaXJzdCBzdWJzZXQKYXQyIDwtIGMoNiwgNiwgNywgNywgNywgNykgICAgICAgICAgIyBTZWNvbmQgc3Vic2V0CmJ0MSA8LSBjKDEsIDEsIDEsIDIsIDIsIDMsIDQpICAgICAgICMgQWx0ZXJuYXRpdmUgZmlyc3Qgc3Vic2V0CmJ0MiA8LSBjKDUsIDUsIDYsIDYsIDcsIDcsIDcsIDcpICAgICMgQWx0ZXJuYXRpdmUgc2Vjb25kIHN1YnNldApgYGAKCmBgYHtyfQojIENvbXB1dGUgdGhlIFN0YW5kYXJkIERldmlhdGlvbiBSZWR1Y3Rpb24gKFNEUikgZm9yIHR3byBwb3NzaWJsZSBzcGxpdHMKIyBTRFIgbWVhc3VyZXMgdGhlIGRlY3JlYXNlIGluIHZhcmlhbmNlIGFmdGVyIGEgc3BsaXQsIGhlbHBpbmcgdG8gZGV0ZXJtaW5lIHRoZSBiZXN0IHNwbGl0LgoKc2RyX2EgPC0gc2QodGVlKSAtIChsZW5ndGgoYXQxKSAvIGxlbmd0aCh0ZWUpICogc2QoYXQxKSArIGxlbmd0aChhdDIpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDIpKQpzZHJfYiA8LSBzZCh0ZWUpIC0gKGxlbmd0aChidDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChidDEpICsgbGVuZ3RoKGJ0MikgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MikpCmBgYAoKYGBge3J9CiMgQ29tcGFyZSB0aGUgU0RSIGZvciBlYWNoIHNwbGl0CiMgVGhlIHNwbGl0IHdpdGggdGhlIGhpZ2hlciBTRFIgaXMgcHJlZmVycmVkIGFzIGl0IHJlc3VsdHMgaW4gYSBncmVhdGVyIHJlZHVjdGlvbiBvZiB2YXJpYW5jZS4Kc2RyX2EKc2RyX2IKYGBgCgojIyBFeGVyY2lzZSBOby4gMzogRXN0aW1hdGluZyBXaW5lIFF1YWxpdHkgdXNpbmcgUmVncmVzc2lvbiBUcmVlcwoKIyMgU3RlcCAyOiBFeHBsb3JpbmcgYW5kIFByZXBhcmluZyB0aGUgRGF0YQoKYGBge3J9CiMgTG9hZCB0aGUgZGF0YXNldCBjb250YWluaW5nIHdoaXRlIHdpbmUgY2hhcmFjdGVyaXN0aWNzIGFuZCBxdWFsaXR5IHJhdGluZ3MKd2luZSA8LSByZWFkLmNzdigid2hpdGV3aW5lcy5jc3YiKQpgYGAKCmBgYHtyfQojIEV4YW1pbmUgdGhlIHN0cnVjdHVyZSBvZiB0aGUgZGF0YXNldCB0byB1bmRlcnN0YW5kIHZhcmlhYmxlIHR5cGVzIGFuZCBtaXNzaW5nIHZhbHVlcwpzdHIod2luZSkKYGBgCgpgYGB7cn0KIyBWaXN1YWxpemUgdGhlIGRpc3RyaWJ1dGlvbiBvZiB3aW5lIHF1YWxpdHkgcmF0aW5ncwojIFRoaXMgaGVscHMgaW4gdW5kZXJzdGFuZGluZyBob3cgYmFsYW5jZWQgdGhlIGRhdGFzZXQgaXMuCmhpc3Qod2luZSRxdWFsaXR5LCBtYWluID0gIkRpc3RyaWJ1dGlvbiBvZiBXaW5lIFF1YWxpdHkiLCB4bGFiID0gIlF1YWxpdHkgUmF0aW5nIiwgY29sID0gImxpZ2h0Ymx1ZSIpCmBgYAoKYGBge3J9CiMgU3VtbWFyeSBzdGF0aXN0aWNzIG9mIHRoZSBkYXRhc2V0CiMgUHJvdmlkZXMgaW5zaWdodHMgaW50byB0aGUgcmFuZ2UsIG1lYW4sIGFuZCBzcHJlYWQgb2YgZWFjaCBmZWF0dXJlCnN1bW1hcnkod2luZSkKYGBgCgojIyBTdGVwIDM6IFNwbGl0dGluZyBEYXRhIGludG8gVHJhaW5pbmcgYW5kIFRlc3RpbmcgU2V0cwpgYGB7cn0KIyBMb2FkIHRoZSBuZWNlc3NhcnkgbGlicmFyeSBmb3IgZGF0YSBwYXJ0aXRpb25pbmcKbGlicmFyeShjYXJldCkKCiMgU2V0IHNlZWQgZm9yIHJlcHJvZHVjaWJpbGl0eQpzZXQuc2VlZCgxMjMpCgojIFNwbGl0IGRhdGEgaW50byA4MCUgdHJhaW5pbmcgYW5kIDIwJSB0ZXN0aW5nCnRyYWluSW5kZXggPC0gY3JlYXRlRGF0YVBhcnRpdGlvbih3aW5lJHF1YWxpdHksIHAgPSAwLjgsIGxpc3QgPSBGQUxTRSkKd2luZV90cmFpbiA8LSB3aW5lW3RyYWluSW5kZXgsIF0Kd2luZV90ZXN0IDwtIHdpbmVbLXRyYWluSW5kZXgsIF0KYGBgCgojIyBTdGVwIDQ6IFRyYWluaW5nIHRoZSBSZWdyZXNzaW9uIFRyZWUgTW9kZWwKYGBge3J9CiMgTG9hZCBuZWNlc3NhcnkgbGlicmFyeSBmb3IgcmVncmVzc2lvbiB0cmVlcwpsaWJyYXJ5KHJwYXJ0KQoKIyBUcmFpbiBhIHJlZ3Jlc3Npb24gdHJlZSBtb2RlbCB0byBwcmVkaWN0IHdpbmUgcXVhbGl0eQp3aW5lX21vZGVsIDwtIHJwYXJ0KHF1YWxpdHkgfiAuLCBkYXRhID0gd2luZV90cmFpbiwgbWV0aG9kID0gImFub3ZhIikKCiMgRGlzcGxheSB0aGUgc3RydWN0dXJlIG9mIHRoZSB0cmVlCnByaW50KHdpbmVfbW9kZWwpCmBgYAoKYGBge3J9CiMgVmlzdWFsaXplIHRoZSByZWdyZXNzaW9uIHRyZWUgc3RydWN0dXJlCmxpYnJhcnkocnBhcnQucGxvdCkKcnBhcnQucGxvdCh3aW5lX21vZGVsLCBtYWluID0gIlJlZ3Jlc3Npb24gVHJlZSBmb3IgV2luZSBRdWFsaXR5IikKYGBgCgojIyBTdGVwIDU6IEV2YWx1YXRpbmcgdGhlIE1vZGVsCmBgYHtyfQojIE1ha2UgcHJlZGljdGlvbnMgb24gdGhlIHRlc3Qgc2V0CndpbmVfcHJlZCA8LSBwcmVkaWN0KHdpbmVfbW9kZWwsIHdpbmVfdGVzdCkKCiMgQ2FsY3VsYXRlIFJvb3QgTWVhbiBTcXVhcmVkIEVycm9yIChSTVNFKSB0byBhc3Nlc3MgbW9kZWwgcGVyZm9ybWFuY2UKcm1zZSA8LSBzcXJ0KG1lYW4oKHdpbmVfdGVzdCRxdWFsaXR5IC0gd2luZV9wcmVkKV4yKSkKcHJpbnQocGFzdGUoIlJvb3QgTWVhbiBTcXVhcmVkIEVycm9yIChSTVNFKToiLCByb3VuZChybXNlLCAyKSkpCmBgYAoKIyMgU3RlcCA2OiBJbXByb3ZpbmcgdGhlIE1vZGVsIChPcHRpb25hbCkKYGBge3J9CiMgUHJ1bmUgdGhlIHRyZWUgdG8gcmVkdWNlIG92ZXJmaXR0aW5nCnBydW5lZF9tb2RlbCA8LSBwcnVuZSh3aW5lX21vZGVsLCBjcCA9IDAuMDEpICAjIEFkanVzdCBjb21wbGV4aXR5IHBhcmFtZXRlciBhcyBuZWVkZWQKcnBhcnQucGxvdChwcnVuZWRfbW9kZWwsIG1haW4gPSAiUHJ1bmVkIFJlZ3Jlc3Npb24gVHJlZSIpCmBgYAoKYGBge3J9CiMgRXZhbHVhdGUgdGhlIHBydW5lZCBtb2RlbAp3aW5lX3ByZWRfcHJ1bmVkIDwtIHByZWRpY3QocHJ1bmVkX21vZGVsLCB3aW5lX3Rlc3QpCnJtc2VfcHJ1bmVkIDwtIHNxcnQobWVhbigod2luZV90ZXN0JHF1YWxpdHkgLSB3aW5lX3ByZWRfcHJ1bmVkKV4yKSkKcHJpbnQocGFzdGUoIlBydW5lZCBNb2RlbCBSTVNFOiIsIHJvdW5kKHJtc2VfcHJ1bmVkLCAyKSkpCmBgYAoKIyBDb21wYXJpbmcgUk1TRSBiZWZvcmUgYW5kIGFmdGVyIHBydW5pbmcgaGVscHMgZGV0ZXJtaW5lIGlmIHBydW5pbmcgaW1wcm92ZXMgdGhlIG1vZGVsLg==