##Chapter 6: Regression Methods #Part 1: Linear Regression #Understanding Regression
getwd()
[1] "/cloud/project"
launch <- read.csv("challenger.csv")
# Estimate beta manually
b <- cov(launch$temperature, launch$distress_ct) / var(launch$temperature)
b
[1] -0.04753968
# Estimate alpha manually
a <- mean(launch$distress_ct) - b*mean(launch$temperature)
a
[1] 3.698413
#Calculate the correlation of Launch Data
r <- cov(launch$temperature, launch$distress_ct) / (sd(launch$temperature)*sd(launch$distress_ct))
r
[1] -0.5111264
cor(launch$temperature, launch$distress_ct)
[1] -0.5111264
# Computing the slope using correlation
r * (sd(launch$distress_ct) / sd(launch$temperature))
[1] -0.04753968
#Confirming the regression line using the lm function (not in text)
model <- lm(distress_ct ~ temperature, data = launch)
model
Call:
lm(formula = distress_ct ~ temperature, data = launch)
Coefficients:
(Intercept) temperature
3.69841 -0.04754
summary(model)
Call:
lm(formula = distress_ct ~ temperature, data = launch)
Residuals:
Min 1Q Median 3Q Max
-0.5608 -0.3944 -0.0854 0.1056 1.8671
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.69841 1.21951 3.033 0.00633 **
temperature -0.04754 0.01744 -2.725 0.01268 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.5774 on 21 degrees of freedom
Multiple R-squared: 0.2613, Adjusted R-squared: 0.2261
F-statistic: 7.426 on 1 and 21 DF, p-value: 0.01268
#Creating a simple multiple regression function
reg <- function(y,x) {
x <- as.matrix(x)
x <- cbind(Interpret = 1,x)
b <- solve(t(x) %*% x) %*% t(x) %*% y
colnames(b) <- "estimate"
print(b)
}
#Examine the launch Data
str(launch)
'data.frame': 23 obs. of 4 variables:
$ distress_ct : int 0 1 0 0 0 0 0 0 1 1 ...
$ temperature : int 66 70 69 68 67 72 73 70 57 63 ...
$ field_check_pressure: int 50 50 50 50 50 50 100 100 200 200 ...
$ flight_num : int 1 2 3 4 5 6 7 8 9 10 ...
#Test regression model with Simple Linear Regression
reg(y = launch$distress_ct, x = launch[2])
estimate
Interpret 3.69841270
temperature -0.04753968
# Use Regression Model with Multiple Regression
reg(y = launch$distress_ct, x = launch[2:4])
estimate
Interpret 3.527093383
temperature -0.051385940
field_check_pressure 0.001757009
flight_num 0.014292843
#Confirming the multiple Regression result using the lm function (no t in text)
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num, data = launch)
model
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Coefficients:
(Intercept) temperature field_check_pressure
3.527093 -0.051386 0.001757
flight_num
0.014293
summary(model)
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Residuals:
Min 1Q Median 3Q Max
-0.65003 -0.24414 -0.11219 0.01279 1.67530
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.527093 1.307024 2.699 0.0142 *
temperature -0.051386 0.018341 -2.802 0.0114 *
field_check_pressure 0.001757 0.003402 0.517 0.6115
flight_num 0.014293 0.035138 0.407 0.6887
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.565 on 19 degrees of freedom
Multiple R-squared: 0.36, Adjusted R-squared: 0.259
F-statistic: 3.563 on 3 and 19 DF, p-value: 0.03371
##Predicting Medical Expenses
## Step 2: Exploring and preparing the data
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
'data.frame': 1338 obs. of 7 variables:
$ age : int 19 18 28 33 32 31 46 37 37 60 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
$ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
$ children: int 0 1 3 0 0 0 1 3 2 0 ...
$ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
$ expenses: num 16885 1726 4449 21984 3867 ...
#Summarize the charges variable
summary(insurance$expenses)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1122 4740 9382 13270 16640 63770
# Histogram of insurance charges
hist(insurance$expenses)
table(insurance$region)
northeast northwest southeast southwest
324 325 364 325
# Exploring relationships among features: correlation matrix
cor(insurance[c("age","bmi","children","expenses")])
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
#Visualizing relationships among features: Scatterplot Matrix
pairs(insurance[c("age","bmi","children","expenses")])
#Step 3: Training a model on the data ---
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
data = insurance)
ins_model <- lm(expenses ~ ., data = insurance) # this is equivalent to above
# see the estimated beta coefficients
ins_model
Call:
lm(formula = expenses ~ ., data = insurance)
Coefficients:
(Intercept) age sexmale
-11941.6 256.8 -131.4
bmi children smokeryes
339.3 475.7 23847.5
regionnorthwest regionsoutheast regionsouthwest
-352.8 -1035.6 -959.3
##Step 4: Evaluating Model Performance
# see more detail about the estimated beta coefficients
summary(ins_model)
Call:
lm(formula = expenses ~ ., data = insurance)
Residuals:
Min 1Q Median 3Q Max
-11302.7 -2850.9 -979.6 1383.9 29981.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11941.6 987.8 -12.089 < 2e-16 ***
age 256.8 11.9 21.586 < 2e-16 ***
sexmale -131.3 332.9 -0.395 0.693255
bmi 339.3 28.6 11.864 < 2e-16 ***
children 475.7 137.8 3.452 0.000574 ***
smokeryes 23847.5 413.1 57.723 < 2e-16 ***
regionnorthwest -352.8 476.3 -0.741 0.458976
regionsoutheast -1035.6 478.7 -2.163 0.030685 *
regionsouthwest -959.3 477.9 -2.007 0.044921 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
F-statistic: 500.9 on 8 and 1329 DF, p-value: < 2.2e-16
##Step 5: Improving Model Performance
# add a higher-order "age" term
insurance$age2 <- insurance$age^2
# add an indicator for BMI >= 30
insurance$bmi30 <- ifelse(insurance$bmi >= 30,1,0)
# Create Final model
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex + bmi30*smoker + region, data = insurance)
summary(ins_model2)
Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
smoker + region, data = insurance)
Residuals:
Min 1Q Median 3Q Max
-17297.1 -1656.0 -1262.7 -727.8 24161.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 139.0053 1363.1359 0.102 0.918792
age -32.6181 59.8250 -0.545 0.585690
age2 3.7307 0.7463 4.999 6.54e-07 ***
children 678.6017 105.8855 6.409 2.03e-10 ***
bmi 119.7715 34.2796 3.494 0.000492 ***
sexmale -496.7690 244.3713 -2.033 0.042267 *
bmi30 -997.9355 422.9607 -2.359 0.018449 *
smokeryes 13404.5952 439.9591 30.468 < 2e-16 ***
regionnorthwest -279.1661 349.2826 -0.799 0.424285
regionsoutheast -828.0345 351.6484 -2.355 0.018682 *
regionsouthwest -1222.1619 350.5314 -3.487 0.000505 ***
bmi30:smokeryes 19810.1534 604.6769 32.762 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
# Making predictions with the Regression Model
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2)
predict(ins_model2,
data.frame(age=21, age2=21^2,children=3,
bmi=24,sex="female",bmi30=0,
smoker="no",region="northwest"))
1
5730.437
predict(ins_model2,
data.frame(age=22,age2=22^2,children=1,
bmi=27,sex="male",bmi30=0,
smoker="yes",region="southeast"))
1
17219.31
##Understanding Regression Trees and Model Trees
##Example: Calculating SDR
# Set up the data
tee <- c(1,1,1,2,2,3,4,5,5,6,6,7,7,7,7)
at1 <- c(1,1,1,2,2,3,4,5,5)
at2 <- c(6,6,7,7,7,7)
bt1 <- c(1,1,1,2,2,3,4)
bt2 <- c(5,5,6,6,7,7,7,7)
#Compute the SDR
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
#Compare the SDR for each split
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
##Exercise No 3: Estimating Wine Quality
##Step 2: Exploring and Preparing the Data
wine <- read.csv("winequality-red.csv")
# Examine the wine Data
str(wine)
'data.frame': 1599 obs. of 12 variables:
$ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
$ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
$ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
$ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
$ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
$ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
$ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
$ density : num 0.998 0.997 0.997 0.998 0.998 ...
$ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
$ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
$ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
$ quality : int 5 5 5 6 5 5 5 7 7 5 ...
# The Distribution of Quality Ratings
hist(wine$quality)
summary(wine)
fixed.acidity volatile.acidity citric.acid
Min. : 4.60 Min. :0.1200 Min. :0.000
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090
Median : 7.90 Median :0.5200 Median :0.260
Mean : 8.32 Mean :0.5278 Mean :0.271
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420
Max. :15.90 Max. :1.5800 Max. :1.000
residual.sugar chlorides free.sulfur.dioxide
Min. : 0.900 Min. :0.01200 Min. : 1.00
1st Qu.: 1.900 1st Qu.:0.07000 1st Qu.: 7.00
Median : 2.200 Median :0.07900 Median :14.00
Mean : 2.539 Mean :0.08747 Mean :15.87
3rd Qu.: 2.600 3rd Qu.:0.09000 3rd Qu.:21.00
Max. :15.500 Max. :0.61100 Max. :72.00
total.sulfur.dioxide density pH
Min. : 6.00 Min. :0.9901 Min. :2.740
1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210
Median : 38.00 Median :0.9968 Median :3.310
Mean : 46.47 Mean :0.9967 Mean :3.311
3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400
Max. :289.00 Max. :1.0037 Max. :4.010
sulphates alcohol quality
Min. :0.3300 Min. : 8.40 Min. :3.000
1st Qu.:0.5500 1st Qu.: 9.50 1st Qu.:5.000
Median :0.6200 Median :10.20 Median :6.000
Mean :0.6581 Mean :10.42 Mean :5.636
3rd Qu.:0.7300 3rd Qu.:11.10 3rd Qu.:6.000
Max. :2.0000 Max. :14.90 Max. :8.000
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
##Step 3: Training a model on the data
#Regression Tree using rpart
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
# get basic information about the tree
m.rpart
n=1599 (2151 observations deleted due to missingness)
node), split, n, deviance, yval
* denotes terminal node
1) root 1599 1042.16500 5.636023
2) alcohol< 10.525 983 424.15870 5.366226
4) sulphates< 0.575 391 128.09720 5.150895 *
5) sulphates>=0.575 592 265.95780 5.508446
10) volatile.acidity>=0.405 448 175.87280 5.404018 *
11) volatile.acidity< 0.405 144 70.00000 5.833333 *
3) alcohol>=10.525 616 432.27110 6.066558
6) sulphates< 0.645 272 191.86760 5.727941
12) volatile.acidity>=1.015 10 6.00000 4.000000 *
13) volatile.acidity< 1.015 262 154.87020 5.793893
26) volatile.acidity>=0.495 146 73.67123 5.575342 *
27) volatile.acidity< 0.495 116 65.44828 6.068966 *
7) sulphates>=0.645 344 184.55520 6.334302
14) alcohol< 11.55 206 101.96600 6.121359
28) volatile.acidity>=0.395 111 37.42342 5.927928 *
29) volatile.acidity< 0.395 95 55.53684 6.347368
58) pH>=3.255 59 29.72881 6.067797 *
59) pH< 3.255 36 13.63889 6.805556 *
15) alcohol>=11.55 138 59.30435 6.652174 *
# get more detailed information about the tree
summary(m.rpart)
Call:
rpart(formula = quality ~ ., data = wine_train)
n=1599 (2151 observations deleted due to missingness)
CP nsplit rel error xerror xstd
1 0.17822061 0 1.0000000 1.0008755 0.03787612
2 0.05358865 1 0.8217794 0.8279516 0.03589512
3 0.02974329 2 0.7681907 0.7896176 0.03310122
4 0.02888577 3 0.7384474 0.7905807 0.03318766
5 0.02234278 4 0.7095617 0.7523823 0.03103167
6 0.01927238 5 0.6872189 0.7375748 0.03060662
7 0.01511346 6 0.6679465 0.7282584 0.02982301
8 0.01015909 7 0.6528331 0.7123437 0.02865414
9 0.01000000 9 0.6325149 0.7086338 0.02828159
Variable importance
alcohol volatile.acidity density
31 17 13
sulphates fixed.acidity chlorides
13 7 6
citric.acid pH total.sulfur.dioxide
5 4 3
free.sulfur.dioxide
1
Node number 1: 1599 observations, complexity param=0.1782206
mean=5.636023, MSE=0.6517605
left son=2 (983 obs) right son=3 (616 obs)
Primary splits:
alcohol < 10.525 to the left, improve=0.17822060, (0 missing)
sulphates < 0.645 to the left, improve=0.12565160, (0 missing)
volatile.acidity < 0.425 to the right, improve=0.11400620, (0 missing)
citric.acid < 0.295 to the left, improve=0.07225368, (0 missing)
density < 0.99539 to the right, improve=0.06402980, (0 missing)
Surrogate splits:
density < 0.995575 to the right, agree=0.762, adj=0.383, (0 split)
chlorides < 0.0685 to the right, agree=0.690, adj=0.195, (0 split)
volatile.acidity < 0.3675 to the right, agree=0.662, adj=0.123, (0 split)
fixed.acidity < 6.75 to the right, agree=0.654, adj=0.101, (0 split)
total.sulfur.dioxide < 17.5 to the right, agree=0.641, adj=0.068, (0 split)
Node number 2: 983 observations, complexity param=0.02888577
mean=5.366226, MSE=0.4314941
left son=4 (391 obs) right son=5 (592 obs)
Primary splits:
sulphates < 0.575 to the left, improve=0.07097282, (0 missing)
volatile.acidity < 0.335 to the right, improve=0.06388554, (0 missing)
alcohol < 9.85 to the left, improve=0.05212216, (0 missing)
fixed.acidity < 10.85 to the left, improve=0.03084011, (0 missing)
total.sulfur.dioxide < 83.5 to the right, improve=0.02749674, (0 missing)
Surrogate splits:
density < 0.996225 to the left, agree=0.662, adj=0.151, (0 split)
volatile.acidity < 0.6525 to the right, agree=0.636, adj=0.084, (0 split)
fixed.acidity < 6.05 to the left, agree=0.609, adj=0.018, (0 split)
citric.acid < 0.115 to the left, agree=0.609, adj=0.018, (0 split)
total.sulfur.dioxide < 9.5 to the left, agree=0.608, adj=0.015, (0 split)
Node number 3: 616 observations, complexity param=0.05358865
mean=6.066558, MSE=0.7017388
left son=6 (272 obs) right son=7 (344 obs)
Primary splits:
sulphates < 0.645 to the left, improve=0.12919720, (0 missing)
volatile.acidity < 0.87 to the right, improve=0.11482610, (0 missing)
citric.acid < 0.295 to the left, improve=0.10819510, (0 missing)
alcohol < 11.55 to the left, improve=0.10309310, (0 missing)
pH < 3.355 to the right, improve=0.07557599, (0 missing)
Surrogate splits:
citric.acid < 0.245 to the left, agree=0.683, adj=0.283, (0 split)
fixed.acidity < 7.85 to the left, agree=0.666, adj=0.243, (0 split)
volatile.acidity < 0.5875 to the right, agree=0.653, adj=0.213, (0 split)
density < 0.994915 to the left, agree=0.635, adj=0.173, (0 split)
pH < 3.405 to the right, agree=0.630, adj=0.162, (0 split)
Node number 4: 391 observations
mean=5.150895, MSE=0.3276143
Node number 5: 592 observations, complexity param=0.01927238
mean=5.508446, MSE=0.449253
left son=10 (448 obs) right son=11 (144 obs)
Primary splits:
volatile.acidity < 0.405 to the right, improve=0.07551952, (0 missing)
total.sulfur.dioxide < 81.5 to the right, improve=0.05845854, (0 missing)
alcohol < 9.85 to the left, improve=0.05386312, (0 missing)
fixed.acidity < 10.95 to the left, improve=0.05335172, (0 missing)
chlorides < 0.0975 to the right, improve=0.03262428, (0 missing)
Surrogate splits:
fixed.acidity < 10.45 to the left, agree=0.787, adj=0.125, (0 split)
chlorides < 0.0565 to the right, agree=0.765, adj=0.035, (0 split)
citric.acid < 0.365 to the left, agree=0.764, adj=0.028, (0 split)
free.sulfur.dioxide < 2.5 to the right, agree=0.764, adj=0.028, (0 split)
alcohol < 8.6 to the right, agree=0.758, adj=0.007, (0 split)
Node number 6: 272 observations, complexity param=0.02974329
mean=5.727941, MSE=0.7053958
left son=12 (10 obs) right son=13 (262 obs)
Primary splits:
volatile.acidity < 1.015 to the right, improve=0.16155630, (0 missing)
alcohol < 11.45 to the left, improve=0.11901850, (0 missing)
citric.acid < 0.255 to the left, improve=0.11313180, (0 missing)
pH < 3.365 to the right, improve=0.09055459, (0 missing)
sulphates < 0.585 to the left, improve=0.04970438, (0 missing)
Node number 7: 344 observations, complexity param=0.02234278
mean=6.334302, MSE=0.5364978
left son=14 (206 obs) right son=15 (138 obs)
Primary splits:
alcohol < 11.55 to the left, improve=0.12616750, (0 missing)
chlorides < 0.0785 to the right, improve=0.05765389, (0 missing)
total.sulfur.dioxide < 101.5 to the right, improve=0.05496021, (0 missing)
density < 0.99537 to the right, improve=0.04412990, (0 missing)
volatile.acidity < 0.425 to the right, improve=0.04136603, (0 missing)
Surrogate splits:
density < 0.994875 to the right, agree=0.701, adj=0.254, (0 split)
chlorides < 0.053 to the right, agree=0.651, adj=0.130, (0 split)
fixed.acidity < 5.55 to the right, agree=0.640, adj=0.101, (0 split)
residual.sugar < 4.25 to the left, agree=0.628, adj=0.072, (0 split)
citric.acid < 0.635 to the left, agree=0.622, adj=0.058, (0 split)
Node number 10: 448 observations
mean=5.404018, MSE=0.3925731
Node number 11: 144 observations
mean=5.833333, MSE=0.4861111
Node number 12: 10 observations
mean=4, MSE=0.6
Node number 13: 262 observations, complexity param=0.01511346
mean=5.793893, MSE=0.5911077
left son=26 (146 obs) right son=27 (116 obs)
Primary splits:
volatile.acidity < 0.495 to the right, improve=0.10170270, (0 missing)
alcohol < 11.45 to the left, improve=0.09838534, (0 missing)
citric.acid < 0.255 to the left, improve=0.09415346, (0 missing)
pH < 3.295 to the right, improve=0.07618253, (0 missing)
density < 0.995155 to the right, improve=0.05214905, (0 missing)
Surrogate splits:
citric.acid < 0.235 to the left, agree=0.866, adj=0.698, (0 split)
pH < 3.305 to the right, agree=0.733, adj=0.397, (0 split)
fixed.acidity < 7.85 to the left, agree=0.691, adj=0.302, (0 split)
alcohol < 11.85 to the left, agree=0.641, adj=0.190, (0 split)
total.sulfur.dioxide < 12.5 to the right, agree=0.637, adj=0.181, (0 split)
Node number 14: 206 observations, complexity param=0.01015909
mean=6.121359, MSE=0.4949807
left son=28 (111 obs) right son=29 (95 obs)
Primary splits:
volatile.acidity < 0.395 to the right, improve=0.08832113, (0 missing)
total.sulfur.dioxide < 49.5 to the right, improve=0.06808035, (0 missing)
chlorides < 0.0945 to the right, improve=0.05079896, (0 missing)
citric.acid < 0.295 to the left, improve=0.05051307, (0 missing)
free.sulfur.dioxide < 25.5 to the right, improve=0.03611908, (0 missing)
Surrogate splits:
citric.acid < 0.285 to the left, agree=0.733, adj=0.421, (0 split)
sulphates < 0.765 to the left, agree=0.655, adj=0.253, (0 split)
chlorides < 0.0675 to the right, agree=0.617, adj=0.168, (0 split)
residual.sugar < 1.85 to the right, agree=0.612, adj=0.158, (0 split)
fixed.acidity < 7.05 to the left, agree=0.597, adj=0.126, (0 split)
Node number 15: 138 observations
mean=6.652174, MSE=0.4297417
Node number 26: 146 observations
mean=5.575342, MSE=0.5045975
Node number 27: 116 observations
mean=6.068966, MSE=0.5642093
Node number 28: 111 observations
mean=5.927928, MSE=0.337148
Node number 29: 95 observations, complexity param=0.01015909
mean=6.347368, MSE=0.5845983
left son=58 (59 obs) right son=59 (36 obs)
Primary splits:
pH < 3.255 to the right, improve=0.21911830, (0 missing)
total.sulfur.dioxide < 56.5 to the right, improve=0.18528400, (0 missing)
fixed.acidity < 10 to the left, improve=0.12899290, (0 missing)
free.sulfur.dioxide < 24.5 to the right, improve=0.11666000, (0 missing)
alcohol < 10.75 to the left, improve=0.05498168, (0 missing)
Surrogate splits:
fixed.acidity < 9.7 to the left, agree=0.737, adj=0.306, (0 split)
total.sulfur.dioxide < 28.5 to the right, agree=0.737, adj=0.306, (0 split)
free.sulfur.dioxide < 9.5 to the right, agree=0.716, adj=0.250, (0 split)
chlorides < 0.0635 to the right, agree=0.663, adj=0.111, (0 split)
sulphates < 0.935 to the left, agree=0.663, adj=0.111, (0 split)
Node number 58: 59 observations
mean=6.067797, MSE=0.5038782
Node number 59: 36 observations
mean=6.805556, MSE=0.378858
install.packages("rpart.plot")
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/rpart.plot_3.1.3.tar.gz'
Content type 'application/x-gzip' length 1014419 bytes (990 KB)
==================================================
downloaded 990 KB
The downloaded source packages are in
‘/tmp/Rtmp8iYKHc/downloaded_packages’
#Use the rpart.plot package to create a visualization
library(rpart.plot)
# a basic decision tree diagram
rpart.plot(m.rpart, digits = 3)
# a few adjustments to the diagram
rpart.plot(m.rpart, digits=4, fallen.leaves = TRUE, type=3, extra=101)
##Step 4: Evaluate Model Performance
# Generate Predictions for the testing dataset
p.rpart <- predict(m.rpart, wine_test)
#Compare the distribution of predicted values vs. actual values
summary(p.rpart)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.404 5.404 5.404 5.404 5.404 5.404
summary(wine_test$quality)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
NA NA NA NaN NA NA 1148
#Compare the correlation
cor(p.rpart, wine_test$quality)
[1] NA
#Function to calculate the mean absolute error
MAE <- function(actual,predicted) {
mean(abs(actual-predicted))
}
# Mean Absolute Error between predicted and actual values
MAE(p.rpart, wine_test$quality)
[1] NA
# Mean Absolute Error between actual values and mean value
mean(wine_train$quality) # result = 5.87
[1] NA
MAE(5.87, wine_test$quality)
[1] NA
##Step 5: Improving Model Performance
install.packages("plyr")
install.packages("Cubist")
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/plyr_1.8.9.tar.gz'
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/reshape2_1.4.5.tar.gz'
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/Cubist_0.5.1.tar.gz'
The downloaded source packages are in
‘/tmp/Rtmp8iYKHc/downloaded_packages’
#train a Cubist Model Tree
library(Cubist)
m.cubist <- cubist(x = wine_train[-12], y=wine_train$quality)
#Display Basic information about the model tree
m.cubist
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Number of samples: 3750
Number of predictors: 11
Number of committees: 1
Number of rules: 7
# Display the tree itself
summary(m.cubist)
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Cubist [Release 2.07 GPL Edition] Tue Feb 3 01:25:38 2026
---------------------------------
Target attribute `outcome'
*** Ignoring cases with unknown or N/A target value
Read 1599 cases (12 attributes) from undefined.data
Model:
Rule 1: [630 cases, mean 5.3, range 3 to 8, est err 0.4]
if
alcohol <= 9.8
then
outcome = 5 - 0.79 volatile.acidity - 0.099 alcohol
+ 0.052 fixed.acidity - 0.31 citric.acid + 0.33 sulphates
+ 0.29 pH - 0.0031 free.sulfur.dioxide
- 0.0007 total.sulfur.dioxide - 0.4 chlorides
Rule 2: [589 cases, mean 5.3, range 3 to 8, est err 0.4]
if
sulphates <= 0.92
alcohol <= 9.8
then
outcome = 5.5 + 1.28 sulphates - 0.9 volatile.acidity - 0.33 citric.acid
+ 0.029 fixed.acidity - 0.033 alcohol
- 0.0008 total.sulfur.dioxide - 0.0023 free.sulfur.dioxide
- 0.4 chlorides - 0.1 pH
Rule 3: [80 cases, mean 5.3, range 3 to 7, est err 0.7]
if
volatile.acidity > 0.31
total.sulfur.dioxide <= 14
sulphates <= 0.63
alcohol > 9.8
then
outcome = 0.5 + 0.549 alcohol - 1.61 volatile.acidity + 0.36 sulphates
- 0.18 pH - 0.0005 total.sulfur.dioxide - 0.07 citric.acid
+ 0.001 free.sulfur.dioxide
Rule 4: [340 cases, mean 5.6, range 4 to 7, est err 0.5]
if
volatile.acidity > 0.31
total.sulfur.dioxide > 14
sulphates <= 0.63
alcohol > 9.8
then
outcome = 5.1 + 2.85 sulphates + 0.19 alcohol - 0.74 citric.acid
- 0.69 volatile.acidity - 0.74 pH
- 0.0027 total.sulfur.dioxide + 0.0013 free.sulfur.dioxide
Rule 5: [407 cases, mean 6.1, range 3 to 8, est err 0.6]
if
volatile.acidity > 0.31
sulphates > 0.63
alcohol > 9.8
then
outcome = 7.6 + 0.309 alcohol - 0.0073 total.sulfur.dioxide - 1.12 pH
- 0.81 volatile.acidity - 0.079 fixed.acidity + 0.22 sulphates
+ 0.002 free.sulfur.dioxide
Rule 6: [71 cases, mean 6.2, range 5 to 8, est err 0.5]
if
volatile.acidity <= 0.31
sulphates <= 0.73
alcohol > 9.8
then
outcome = 131.4 + 4.85 volatile.acidity - 124 density - 1.35 pH
+ 0.056 fixed.acidity + 0.54 sulphates + 0.036 alcohol
+ 0.021 residual.sugar
Rule 7: [85 cases, mean 6.5, range 5 to 8, est err 0.4]
if
volatile.acidity <= 0.31
sulphates > 0.73
then
outcome = 17 + 0.39 alcohol + 0.113 fixed.acidity
+ 0.25 volatile.acidity - 16 density + 0.14 sulphates
Evaluation on training data (1599 cases):
Average |error| 0.4
Relative |error| 0.62
Correlation coefficient 0.62
Attribute usage:
Conds Model
96% 100% alcohol
71% 100% sulphates
45% 100% volatile.acidity
19% 93% total.sulfur.dioxide
96% pH
93% free.sulfur.dioxide
81% fixed.acidity
74% citric.acid
55% chlorides
7% density
3% residual.sugar
Time: 0.0 secs
# Generate predictions for the model
p.cubist <- predict(m.cubist, wine_test)
# sumarry statistic about the predictions
summary(p.cubist)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.865 5.865 5.865 5.865 5.865 5.865
# correlation between the predicted and true values
cor(p.cubist, wine_test$quality)
[1] NA
# mean absolute error of predicted and true values
# (uses a custom function defined above)
MAE(wine_test$quality, p.cubist)
[1] NA