This is an
launch <- read.csv("challenger2.csv")
b <- cov(launch$temperature, launch$distress_ct) / var(launch$temperature)
b
[1] -0.03364796
cor(launch$temperature, launch$distress_ct)
[1] -0.3359996
r * (sd(launch$distress_ct) / sd(launch$temperature))
[1] -0.05118565
summary(model)
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Residuals:
Min 1Q Median 3Q Max
-0.65003 -0.24414 -0.11219 0.01279 1.67530
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 3.527093 1.307024 2.699 0.0142 *
temperature -0.051386 0.018341 -2.802 0.0114 *
field_check_pressure 0.001757 0.003402 0.517 0.6115
flight_num 0.014293 0.035138 0.407 0.6887
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.565 on 19 degrees of freedom
Multiple R-squared: 0.36, Adjusted R-squared: 0.259
F-statistic: 3.563 on 3 and 19 DF, p-value: 0.03371
reg <- function(y, x) {
x <- as.matrix(x)
x <- cbind(Intercept = 1, x)
b <- solve(t(x) %*% x) %*% t(x) %*% y
colnames(b) <- "estimate"
print(b)
}
str(launch)
'data.frame': 29 obs. of 4 variables:
$ distress_ct : int 0 1 0 0 0 0 0 0 1 1 ...
$ temperature : int 66 70 69 68 67 72 73 70 57 63 ...
$ field_check_pressure: int 50 50 50 50 50 50 100 100 200 200 ...
$ flight_num : int 1 2 3 4 5 6 7 8 9 10 ...
reg(y = launch$distress_ct, x = launch[2])
estimate
Intercept 2.81458456
temperature -0.03364796
model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num, data = launch)
model
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Coefficients:
(Intercept) temperature field_check_pressure flight_num
2.240e+00 -3.124e-02 -2.587e-05 2.762e-02
summary(model)
Call:
lm(formula = distress_ct ~ temperature + field_check_pressure +
flight_num, data = launch)
Residuals:
Min 1Q Median 3Q Max
-1.2744 -0.3335 -0.1657 0.2975 1.5284
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.240e+00 1.267e+00 1.767 0.0894 .
temperature -3.124e-02 1.787e-02 -1.748 0.0927 .
field_check_pressure -2.587e-05 2.383e-03 -0.011 0.9914
flight_num 2.762e-02 1.798e-02 1.537 0.1369
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.6926 on 25 degrees of freedom
Multiple R-squared: 0.2132, Adjusted R-squared: 0.1188
F-statistic: 2.259 on 3 and 25 DF, p-value: 0.1063
insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)
'data.frame': 1338 obs. of 7 variables:
$ age : int 19 18 28 33 32 31 46 37 37 60 ...
$ sex : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
$ bmi : num 27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
$ children: int 0 1 3 0 0 0 1 3 2 0 ...
$ smoker : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
$ region : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
$ expenses: num 16885 1726 4449 21984 3867 ...
summary(insurance$expenses)
Min. 1st Qu. Median Mean 3rd Qu. Max.
1122 4740 9382 13270 16640 63770
hist(insurance$expenses)
table(insurance$region)
northeast northwest southeast southwest
324 325 364 325
cor(insurance[c("age", "bmi", "children", "expenses")])
age bmi children expenses
age 1.0000000 0.10934101 0.04246900 0.29900819
bmi 0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000
pairs(insurance[c("age", "bmi", "children", "expenses")])
ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
data = insurance)
ins_model <- lm(expenses ~ ., data = insurance)
ins_model
Call:
lm(formula = expenses ~ ., data = insurance)
Coefficients:
(Intercept) age sexmale bmi children smokeryes
-11941.6 256.8 -131.4 339.3 475.7 23847.5
regionnorthwest regionsoutheast regionsouthwest
-352.8 -1035.6 -959.3
summary(ins_model)
Call:
lm(formula = expenses ~ ., data = insurance)
Residuals:
Min 1Q Median 3Q Max
-11302.7 -2850.9 -979.6 1383.9 29981.7
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -11941.6 987.8 -12.089 < 2e-16 ***
age 256.8 11.9 21.586 < 2e-16 ***
sexmale -131.3 332.9 -0.395 0.693255
bmi 339.3 28.6 11.864 < 2e-16 ***
children 475.7 137.8 3.452 0.000574 ***
smokeryes 23847.5 413.1 57.723 < 2e-16 ***
regionnorthwest -352.8 476.3 -0.741 0.458976
regionsoutheast -1035.6 478.7 -2.163 0.030685 *
regionsouthwest -959.3 477.9 -2.007 0.044921 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared: 0.7509, Adjusted R-squared: 0.7494
F-statistic: 500.9 on 8 and 1329 DF, p-value: < 2.2e-16
insurance$age2 <- insurance$age^2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
bmi30*smoker + region, data = insurance)
summary(ins_model2)
Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 *
smoker + region, data = insurance)
Residuals:
Min 1Q Median 3Q Max
-17297.1 -1656.0 -1262.7 -727.8 24161.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 139.0053 1363.1359 0.102 0.918792
age -32.6181 59.8250 -0.545 0.585690
age2 3.7307 0.7463 4.999 6.54e-07 ***
children 678.6017 105.8855 6.409 2.03e-10 ***
bmi 119.7715 34.2796 3.494 0.000492 ***
sexmale -496.7690 244.3713 -2.033 0.042267 *
bmi30 -997.9355 422.9607 -2.359 0.018449 *
smokeryes 13404.5952 439.9591 30.468 < 2e-16 ***
regionnorthwest -279.1661 349.2826 -0.799 0.424285
regionsoutheast -828.0345 351.6484 -2.355 0.018682 *
regionsouthwest -1222.1619 350.5314 -3.487 0.000505 ***
bmi30:smokeryes 19810.1534 604.6769 32.762 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared: 0.8664, Adjusted R-squared: 0.8653
F-statistic: 781.7 on 11 and 1326 DF, p-value: < 2.2e-16
insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)
[1] 0.9307999
plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2)
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 2,
bmi = 30, sex = "male", bmi30 = 1,
smoker = "no", region = "northeast"))
1
5973.774
predict(ins_model2,
data.frame(age = 30, age2 = 30^2, children = 0,
bmi = 30, sex = "female", bmi30 = 1,
smoker = "no", region = "northeast"))
1
5113.34
tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
sdr_a
[1] 1.202815
sdr_b
[1] 1.392751
wine <- read.csv("winequality-red.csv")
str(wine)
'data.frame': 1599 obs. of 12 variables:
$ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
$ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
$ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
$ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
$ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
$ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
$ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
$ density : num 0.998 0.997 0.997 0.998 0.998 ...
$ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
$ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
$ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
$ quality : int 5 5 5 6 5 5 5 7 7 5 ...
hist(wine$quality)
summary(wine)
fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900 Min. :0.01200
1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900 1st Qu.:0.07000
Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200 Median :0.07900
Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539 Mean :0.08747
3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600 3rd Qu.:0.09000
Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500 Max. :0.61100
free.sulfur.dioxide total.sulfur.dioxide density pH sulphates
Min. : 1.00 Min. : 6.00 Min. :0.9901 Min. :2.740 Min. :0.3300
1st Qu.: 7.00 1st Qu.: 22.00 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500
Median :14.00 Median : 38.00 Median :0.9968 Median :3.310 Median :0.6200
Mean :15.87 Mean : 46.47 Mean :0.9967 Mean :3.311 Mean :0.6581
3rd Qu.:21.00 3rd Qu.: 62.00 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300
Max. :72.00 Max. :289.00 Max. :1.0037 Max. :4.010 Max. :2.0000
alcohol quality
Min. : 8.40 Min. :3.000
1st Qu.: 9.50 1st Qu.:5.000
Median :10.20 Median :6.000
Mean :10.42 Mean :5.636
3rd Qu.:11.10 3rd Qu.:6.000
Max. :14.90 Max. :8.000
wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
m.rpart
n=1599 (2151 observations deleted due to missingness)
node), split, n, deviance, yval
* denotes terminal node
1) root 1599 1042.16500 5.636023
2) alcohol< 10.525 983 424.15870 5.366226
4) sulphates< 0.575 391 128.09720 5.150895 *
5) sulphates>=0.575 592 265.95780 5.508446
10) volatile.acidity>=0.405 448 175.87280 5.404018 *
11) volatile.acidity< 0.405 144 70.00000 5.833333 *
3) alcohol>=10.525 616 432.27110 6.066558
6) sulphates< 0.645 272 191.86760 5.727941
12) volatile.acidity>=1.015 10 6.00000 4.000000 *
13) volatile.acidity< 1.015 262 154.87020 5.793893
26) volatile.acidity>=0.495 146 73.67123 5.575342 *
27) volatile.acidity< 0.495 116 65.44828 6.068966 *
7) sulphates>=0.645 344 184.55520 6.334302
14) alcohol< 11.55 206 101.96600 6.121359
28) volatile.acidity>=0.395 111 37.42342 5.927928 *
29) volatile.acidity< 0.395 95 55.53684 6.347368
58) pH>=3.255 59 29.72881 6.067797 *
59) pH< 3.255 36 13.63889 6.805556 *
15) alcohol>=11.55 138 59.30435 6.652174 *
summary(m.rpart)
Call:
rpart(formula = quality ~ ., data = wine_train)
n=1599 (2151 observations deleted due to missingness)
CP nsplit rel error xerror xstd
1 0.17822061 0 1.0000000 1.0017513 0.03790139
2 0.05358865 1 0.8217794 0.8385585 0.03678114
3 0.02974329 2 0.7681907 0.7940327 0.03360605
4 0.02888577 3 0.7384474 0.7939605 0.03300086
5 0.02234278 4 0.7095617 0.7736573 0.03211681
6 0.01927238 5 0.6872189 0.7511204 0.03070159
7 0.01511346 6 0.6679465 0.7318769 0.02955708
8 0.01015909 7 0.6528331 0.7249689 0.02925140
9 0.01000000 9 0.6325149 0.7140139 0.02911207
Variable importance
alcohol volatile.acidity density sulphates
31 17 13 13
fixed.acidity chlorides citric.acid pH
7 6 5 4
total.sulfur.dioxide free.sulfur.dioxide
3 1
Node number 1: 1599 observations, complexity param=0.1782206
mean=5.636023, MSE=0.6517605
left son=2 (983 obs) right son=3 (616 obs)
Primary splits:
alcohol < 10.525 to the left, improve=0.17822060, (0 missing)
sulphates < 0.645 to the left, improve=0.12565160, (0 missing)
volatile.acidity < 0.425 to the right, improve=0.11400620, (0 missing)
citric.acid < 0.295 to the left, improve=0.07225368, (0 missing)
density < 0.99539 to the right, improve=0.06402980, (0 missing)
Surrogate splits:
density < 0.995575 to the right, agree=0.762, adj=0.383, (0 split)
chlorides < 0.0685 to the right, agree=0.690, adj=0.195, (0 split)
volatile.acidity < 0.3675 to the right, agree=0.662, adj=0.123, (0 split)
fixed.acidity < 6.75 to the right, agree=0.654, adj=0.101, (0 split)
total.sulfur.dioxide < 17.5 to the right, agree=0.641, adj=0.068, (0 split)
Node number 2: 983 observations, complexity param=0.02888577
mean=5.366226, MSE=0.4314941
left son=4 (391 obs) right son=5 (592 obs)
Primary splits:
sulphates < 0.575 to the left, improve=0.07097282, (0 missing)
volatile.acidity < 0.335 to the right, improve=0.06388554, (0 missing)
alcohol < 9.85 to the left, improve=0.05212216, (0 missing)
fixed.acidity < 10.85 to the left, improve=0.03084011, (0 missing)
total.sulfur.dioxide < 83.5 to the right, improve=0.02749674, (0 missing)
Surrogate splits:
density < 0.996225 to the left, agree=0.662, adj=0.151, (0 split)
volatile.acidity < 0.6525 to the right, agree=0.636, adj=0.084, (0 split)
fixed.acidity < 6.05 to the left, agree=0.609, adj=0.018, (0 split)
citric.acid < 0.115 to the left, agree=0.609, adj=0.018, (0 split)
total.sulfur.dioxide < 9.5 to the left, agree=0.608, adj=0.015, (0 split)
Node number 3: 616 observations, complexity param=0.05358865
mean=6.066558, MSE=0.7017388
left son=6 (272 obs) right son=7 (344 obs)
Primary splits:
sulphates < 0.645 to the left, improve=0.12919720, (0 missing)
volatile.acidity < 0.87 to the right, improve=0.11482610, (0 missing)
citric.acid < 0.295 to the left, improve=0.10819510, (0 missing)
alcohol < 11.55 to the left, improve=0.10309310, (0 missing)
pH < 3.355 to the right, improve=0.07557599, (0 missing)
Surrogate splits:
citric.acid < 0.245 to the left, agree=0.683, adj=0.283, (0 split)
fixed.acidity < 7.85 to the left, agree=0.666, adj=0.243, (0 split)
volatile.acidity < 0.5875 to the right, agree=0.653, adj=0.213, (0 split)
density < 0.994915 to the left, agree=0.635, adj=0.173, (0 split)
pH < 3.405 to the right, agree=0.630, adj=0.162, (0 split)
Node number 4: 391 observations
mean=5.150895, MSE=0.3276143
Node number 5: 592 observations, complexity param=0.01927238
mean=5.508446, MSE=0.449253
left son=10 (448 obs) right son=11 (144 obs)
Primary splits:
volatile.acidity < 0.405 to the right, improve=0.07551952, (0 missing)
total.sulfur.dioxide < 81.5 to the right, improve=0.05845854, (0 missing)
alcohol < 9.85 to the left, improve=0.05386312, (0 missing)
fixed.acidity < 10.95 to the left, improve=0.05335172, (0 missing)
chlorides < 0.0975 to the right, improve=0.03262428, (0 missing)
Surrogate splits:
fixed.acidity < 10.45 to the left, agree=0.787, adj=0.125, (0 split)
chlorides < 0.0565 to the right, agree=0.765, adj=0.035, (0 split)
citric.acid < 0.365 to the left, agree=0.764, adj=0.028, (0 split)
free.sulfur.dioxide < 2.5 to the right, agree=0.764, adj=0.028, (0 split)
alcohol < 8.6 to the right, agree=0.758, adj=0.007, (0 split)
Node number 6: 272 observations, complexity param=0.02974329
mean=5.727941, MSE=0.7053958
left son=12 (10 obs) right son=13 (262 obs)
Primary splits:
volatile.acidity < 1.015 to the right, improve=0.16155630, (0 missing)
alcohol < 11.45 to the left, improve=0.11901850, (0 missing)
citric.acid < 0.255 to the left, improve=0.11313180, (0 missing)
pH < 3.365 to the right, improve=0.09055459, (0 missing)
sulphates < 0.585 to the left, improve=0.04970438, (0 missing)
Node number 7: 344 observations, complexity param=0.02234278
mean=6.334302, MSE=0.5364978
left son=14 (206 obs) right son=15 (138 obs)
Primary splits:
alcohol < 11.55 to the left, improve=0.12616750, (0 missing)
chlorides < 0.0785 to the right, improve=0.05765389, (0 missing)
total.sulfur.dioxide < 101.5 to the right, improve=0.05496021, (0 missing)
density < 0.99537 to the right, improve=0.04412990, (0 missing)
volatile.acidity < 0.425 to the right, improve=0.04136603, (0 missing)
Surrogate splits:
density < 0.994875 to the right, agree=0.701, adj=0.254, (0 split)
chlorides < 0.053 to the right, agree=0.651, adj=0.130, (0 split)
fixed.acidity < 5.55 to the right, agree=0.640, adj=0.101, (0 split)
residual.sugar < 4.25 to the left, agree=0.628, adj=0.072, (0 split)
citric.acid < 0.635 to the left, agree=0.622, adj=0.058, (0 split)
Node number 10: 448 observations
mean=5.404018, MSE=0.3925731
Node number 11: 144 observations
mean=5.833333, MSE=0.4861111
Node number 12: 10 observations
mean=4, MSE=0.6
Node number 13: 262 observations, complexity param=0.01511346
mean=5.793893, MSE=0.5911077
left son=26 (146 obs) right son=27 (116 obs)
Primary splits:
volatile.acidity < 0.495 to the right, improve=0.10170270, (0 missing)
alcohol < 11.45 to the left, improve=0.09838534, (0 missing)
citric.acid < 0.255 to the left, improve=0.09415346, (0 missing)
pH < 3.295 to the right, improve=0.07618253, (0 missing)
density < 0.995155 to the right, improve=0.05214905, (0 missing)
Surrogate splits:
citric.acid < 0.235 to the left, agree=0.866, adj=0.698, (0 split)
pH < 3.305 to the right, agree=0.733, adj=0.397, (0 split)
fixed.acidity < 7.85 to the left, agree=0.691, adj=0.302, (0 split)
alcohol < 11.85 to the left, agree=0.641, adj=0.190, (0 split)
total.sulfur.dioxide < 12.5 to the right, agree=0.637, adj=0.181, (0 split)
Node number 14: 206 observations, complexity param=0.01015909
mean=6.121359, MSE=0.4949807
left son=28 (111 obs) right son=29 (95 obs)
Primary splits:
volatile.acidity < 0.395 to the right, improve=0.08832113, (0 missing)
total.sulfur.dioxide < 49.5 to the right, improve=0.06808035, (0 missing)
chlorides < 0.0945 to the right, improve=0.05079896, (0 missing)
citric.acid < 0.295 to the left, improve=0.05051307, (0 missing)
free.sulfur.dioxide < 25.5 to the right, improve=0.03611908, (0 missing)
Surrogate splits:
citric.acid < 0.285 to the left, agree=0.733, adj=0.421, (0 split)
sulphates < 0.765 to the left, agree=0.655, adj=0.253, (0 split)
chlorides < 0.0675 to the right, agree=0.617, adj=0.168, (0 split)
residual.sugar < 1.85 to the right, agree=0.612, adj=0.158, (0 split)
fixed.acidity < 7.05 to the left, agree=0.597, adj=0.126, (0 split)
Node number 15: 138 observations
mean=6.652174, MSE=0.4297417
Node number 26: 146 observations
mean=5.575342, MSE=0.5045975
Node number 27: 116 observations
mean=6.068966, MSE=0.5642093
Node number 28: 111 observations
mean=5.927928, MSE=0.337148
Node number 29: 95 observations, complexity param=0.01015909
mean=6.347368, MSE=0.5845983
left son=58 (59 obs) right son=59 (36 obs)
Primary splits:
pH < 3.255 to the right, improve=0.21911830, (0 missing)
total.sulfur.dioxide < 56.5 to the right, improve=0.18528400, (0 missing)
fixed.acidity < 10 to the left, improve=0.12899290, (0 missing)
free.sulfur.dioxide < 24.5 to the right, improve=0.11666000, (0 missing)
alcohol < 10.75 to the left, improve=0.05498168, (0 missing)
Surrogate splits:
fixed.acidity < 9.7 to the left, agree=0.737, adj=0.306, (0 split)
total.sulfur.dioxide < 28.5 to the right, agree=0.737, adj=0.306, (0 split)
free.sulfur.dioxide < 9.5 to the right, agree=0.716, adj=0.250, (0 split)
chlorides < 0.0635 to the right, agree=0.663, adj=0.111, (0 split)
sulphates < 0.935 to the left, agree=0.663, adj=0.111, (0 split)
Node number 58: 59 observations
mean=6.067797, MSE=0.5038782
Node number 59: 36 observations
mean=6.805556, MSE=0.378858
install.packages("rpart.plot")
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/rpart.plot_3.1.3.tar.gz'
Content type 'application/x-gzip' length 1014419 bytes (990 KB)
==================================================
downloaded 990 KB
The downloaded source packages are in
‘/tmp/Rtmpom6lTf/downloaded_packages’
library(rpart.plot)
rpart.plot(m.rpart, digits = 3)
rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)
p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.404 5.404 5.404 5.404 5.404 5.404
summary(wine_test$quality)
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
NA NA NA NaN NA NA 1148
cor(p.rpart, wine_test$quality)
[1] NA
MAE <- function(actual, predicted) {
mean(abs(actual - predicted))
}
MAE(p.rpart, wine_test$quality)
[1] NA
mean(wine_train$quality)
[1] NA
MAE(5.87, wine_test$quality)
[1] NA
install.packages("plyr")
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/plyr_1.8.9.tar.gz'
Content type 'application/x-gzip' length 402041 bytes (392 KB)
==================================================
downloaded 392 KB
The downloaded source packages are in
‘/tmp/RtmpjTmTaf/downloaded_packages’
install.packages("Cubist")
trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/Cubist_0.5.1.tar.gz'
Content type 'application/x-gzip' length 881676 bytes (861 KB)
==================================================
downloaded 861 KB
The downloaded source packages are in
‘/tmp/RtmpjTmTaf/downloaded_packages’
library(Cubist)
m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
m.cubist
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Number of samples: 3750
Number of predictors: 11
Number of committees: 1
Number of rules: 7
summary(m.cubist)
Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)
Cubist [Release 2.07 GPL Edition] Mon Feb 9 23:47:30 2026
---------------------------------
Target attribute `outcome'
*** Ignoring cases with unknown or N/A target value
Read 1599 cases (12 attributes) from undefined.data
Model:
Rule 1: [630 cases, mean 5.3, range 3 to 8, est err 0.4]
if
alcohol <= 9.8
then
outcome = 5 - 0.79 volatile.acidity - 0.099 alcohol
+ 0.052 fixed.acidity - 0.31 citric.acid + 0.33 sulphates
+ 0.29 pH - 0.0031 free.sulfur.dioxide
- 0.0007 total.sulfur.dioxide - 0.4 chlorides
Rule 2: [589 cases, mean 5.3, range 3 to 8, est err 0.4]
if
sulphates <= 0.92
alcohol <= 9.8
then
outcome = 5.5 + 1.28 sulphates - 0.9 volatile.acidity - 0.33 citric.acid
+ 0.029 fixed.acidity - 0.033 alcohol
- 0.0008 total.sulfur.dioxide - 0.0023 free.sulfur.dioxide
- 0.4 chlorides - 0.1 pH
Rule 3: [80 cases, mean 5.3, range 3 to 7, est err 0.7]
if
volatile.acidity > 0.31
total.sulfur.dioxide <= 14
sulphates <= 0.63
alcohol > 9.8
then
outcome = 0.5 + 0.549 alcohol - 1.61 volatile.acidity + 0.36 sulphates
- 0.18 pH - 0.0005 total.sulfur.dioxide - 0.07 citric.acid
+ 0.001 free.sulfur.dioxide
Rule 4: [340 cases, mean 5.6, range 4 to 7, est err 0.5]
if
volatile.acidity > 0.31
total.sulfur.dioxide > 14
sulphates <= 0.63
alcohol > 9.8
then
outcome = 5.1 + 2.85 sulphates + 0.19 alcohol - 0.74 citric.acid
- 0.69 volatile.acidity - 0.74 pH
- 0.0027 total.sulfur.dioxide + 0.0013 free.sulfur.dioxide
Rule 5: [407 cases, mean 6.1, range 3 to 8, est err 0.6]
if
volatile.acidity > 0.31
sulphates > 0.63
alcohol > 9.8
then
outcome = 7.6 + 0.309 alcohol - 0.0073 total.sulfur.dioxide - 1.12 pH
- 0.81 volatile.acidity - 0.079 fixed.acidity + 0.22 sulphates
+ 0.002 free.sulfur.dioxide
Rule 6: [71 cases, mean 6.2, range 5 to 8, est err 0.5]
if
volatile.acidity <= 0.31
sulphates <= 0.73
alcohol > 9.8
then
outcome = 131.4 + 4.85 volatile.acidity - 124 density - 1.35 pH
+ 0.056 fixed.acidity + 0.54 sulphates + 0.036 alcohol
+ 0.021 residual.sugar
Rule 7: [85 cases, mean 6.5, range 5 to 8, est err 0.4]
if
volatile.acidity <= 0.31
sulphates > 0.73
then
outcome = 17 + 0.39 alcohol + 0.113 fixed.acidity
+ 0.25 volatile.acidity - 16 density + 0.14 sulphates
Evaluation on training data (1599 cases):
Average |error| 0.4
Relative |error| 0.62
Correlation coefficient 0.62
Attribute usage:
Conds Model
96% 100% alcohol
71% 100% sulphates
45% 100% volatile.acidity
19% 93% total.sulfur.dioxide
96% pH
93% free.sulfur.dioxide
81% fixed.acidity
74% citric.acid
55% chlorides
7% density
3% residual.sugar
Time: 0.0 secs
p.cubist <- predict(m.cubist, wine_test)
summary(p.cubist)
Min. 1st Qu. Median Mean 3rd Qu. Max.
5.865 5.865 5.865 5.865 5.865 5.865
cor(p.cubist, wine_test$quality)
[1] NA
MAE(wine_test$quality, p.cubist)
[1] NA