R Notebook

This is an

launch <- read.csv("challenger2.csv")

b <- cov(launch$temperature, launch$distress_ct) / var(launch$temperature)
b

[1] -0.03364796

cor(launch$temperature, launch$distress_ct)

[1] -0.3359996

r * (sd(launch$distress_ct) / sd(launch$temperature))

[1] -0.05118565

summary(model)


Call:
lm(formula = distress_ct ~ temperature + field_check_pressure + 
    flight_num, data = launch)

Residuals:
     Min       1Q   Median       3Q      Max 
-0.65003 -0.24414 -0.11219  0.01279  1.67530 

Coefficients:
                      Estimate Std. Error t value Pr(>|t|)  
(Intercept)           3.527093   1.307024   2.699   0.0142 *
temperature          -0.051386   0.018341  -2.802   0.0114 *
field_check_pressure  0.001757   0.003402   0.517   0.6115  
flight_num            0.014293   0.035138   0.407   0.6887  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.565 on 19 degrees of freedom
Multiple R-squared:   0.36, Adjusted R-squared:  0.259 
F-statistic: 3.563 on 3 and 19 DF,  p-value: 0.03371

reg <- function(y, x) {
  x <- as.matrix(x)
  x <- cbind(Intercept = 1, x)
  b <- solve(t(x) %*% x) %*% t(x) %*% y
  colnames(b) <- "estimate"
  print(b)
}
str(launch)

'data.frame':   29 obs. of  4 variables:
 $ distress_ct         : int  0 1 0 0 0 0 0 0 1 1 ...
 $ temperature         : int  66 70 69 68 67 72 73 70 57 63 ...
 $ field_check_pressure: int  50 50 50 50 50 50 100 100 200 200 ...
 $ flight_num          : int  1 2 3 4 5 6 7 8 9 10 ...

reg(y = launch$distress_ct, x = launch[2])

               estimate
Intercept    2.81458456
temperature -0.03364796

model <- lm(distress_ct ~ temperature + field_check_pressure + flight_num, data = launch)
model


Call:
lm(formula = distress_ct ~ temperature + field_check_pressure + 
    flight_num, data = launch)

Coefficients:
         (Intercept)           temperature  field_check_pressure            flight_num  
           2.240e+00            -3.124e-02            -2.587e-05             2.762e-02

summary(model)


Call:
lm(formula = distress_ct ~ temperature + field_check_pressure + 
    flight_num, data = launch)

Residuals:
    Min      1Q  Median      3Q     Max 
-1.2744 -0.3335 -0.1657  0.2975  1.5284 

Coefficients:
                       Estimate Std. Error t value Pr(>|t|)  
(Intercept)           2.240e+00  1.267e+00   1.767   0.0894 .
temperature          -3.124e-02  1.787e-02  -1.748   0.0927 .
field_check_pressure -2.587e-05  2.383e-03  -0.011   0.9914  
flight_num            2.762e-02  1.798e-02   1.537   0.1369  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 0.6926 on 25 degrees of freedom
Multiple R-squared:  0.2132,    Adjusted R-squared:  0.1188 
F-statistic: 2.259 on 3 and 25 DF,  p-value: 0.1063

insurance <- read.csv("insurance.csv", stringsAsFactors = TRUE)
str(insurance)

'data.frame':   1338 obs. of  7 variables:
 $ age     : int  19 18 28 33 32 31 46 37 37 60 ...
 $ sex     : Factor w/ 2 levels "female","male": 1 2 2 2 2 1 1 1 2 1 ...
 $ bmi     : num  27.9 33.8 33 22.7 28.9 25.7 33.4 27.7 29.8 25.8 ...
 $ children: int  0 1 3 0 0 0 1 3 2 0 ...
 $ smoker  : Factor w/ 2 levels "no","yes": 2 1 1 1 1 1 1 1 1 1 ...
 $ region  : Factor w/ 4 levels "northeast","northwest",..: 4 3 3 2 2 3 3 2 1 2 ...
 $ expenses: num  16885 1726 4449 21984 3867 ...

summary(insurance$expenses)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1122    4740    9382   13270   16640   63770

hist(insurance$expenses)

table(insurance$region)


northeast northwest southeast southwest 
      324       325       364       325

cor(insurance[c("age", "bmi", "children", "expenses")])

               age        bmi   children   expenses
age      1.0000000 0.10934101 0.04246900 0.29900819
bmi      0.1093410 1.00000000 0.01264471 0.19857626
children 0.0424690 0.01264471 1.00000000 0.06799823
expenses 0.2990082 0.19857626 0.06799823 1.00000000

pairs(insurance[c("age", "bmi", "children", "expenses")])

ins_model <- lm(expenses ~ age + children + bmi + sex + smoker + region,
                data = insurance)
ins_model <- lm(expenses ~ ., data = insurance)
ins_model


Call:
lm(formula = expenses ~ ., data = insurance)

Coefficients:
    (Intercept)              age          sexmale              bmi         children        smokeryes  
       -11941.6            256.8           -131.4            339.3            475.7          23847.5  
regionnorthwest  regionsoutheast  regionsouthwest  
         -352.8          -1035.6           -959.3

summary(ins_model)


Call:
lm(formula = expenses ~ ., data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-11302.7  -2850.9   -979.6   1383.9  29981.7 

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)     -11941.6      987.8 -12.089  < 2e-16 ***
age                256.8       11.9  21.586  < 2e-16 ***
sexmale           -131.3      332.9  -0.395 0.693255    
bmi                339.3       28.6  11.864  < 2e-16 ***
children           475.7      137.8   3.452 0.000574 ***
smokeryes        23847.5      413.1  57.723  < 2e-16 ***
regionnorthwest   -352.8      476.3  -0.741 0.458976    
regionsoutheast  -1035.6      478.7  -2.163 0.030685 *  
regionsouthwest   -959.3      477.9  -2.007 0.044921 *  
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 6062 on 1329 degrees of freedom
Multiple R-squared:  0.7509,    Adjusted R-squared:  0.7494 
F-statistic: 500.9 on 8 and 1329 DF,  p-value: < 2.2e-16

insurance$age2 <- insurance$age^2
insurance$bmi30 <- ifelse(insurance$bmi >= 30, 1, 0)
ins_model2 <- lm(expenses ~ age + age2 + children + bmi + sex +
                   bmi30*smoker + region, data = insurance)
summary(ins_model2)


Call:
lm(formula = expenses ~ age + age2 + children + bmi + sex + bmi30 * 
    smoker + region, data = insurance)

Residuals:
     Min       1Q   Median       3Q      Max 
-17297.1  -1656.0  -1262.7   -727.8  24161.6 

Coefficients:
                  Estimate Std. Error t value Pr(>|t|)    
(Intercept)       139.0053  1363.1359   0.102 0.918792    
age               -32.6181    59.8250  -0.545 0.585690    
age2                3.7307     0.7463   4.999 6.54e-07 ***
children          678.6017   105.8855   6.409 2.03e-10 ***
bmi               119.7715    34.2796   3.494 0.000492 ***
sexmale          -496.7690   244.3713  -2.033 0.042267 *  
bmi30            -997.9355   422.9607  -2.359 0.018449 *  
smokeryes       13404.5952   439.9591  30.468  < 2e-16 ***
regionnorthwest  -279.1661   349.2826  -0.799 0.424285    
regionsoutheast  -828.0345   351.6484  -2.355 0.018682 *  
regionsouthwest -1222.1619   350.5314  -3.487 0.000505 ***
bmi30:smokeryes 19810.1534   604.6769  32.762  < 2e-16 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 4445 on 1326 degrees of freedom
Multiple R-squared:  0.8664,    Adjusted R-squared:  0.8653 
F-statistic: 781.7 on 11 and 1326 DF,  p-value: < 2.2e-16

insurance$pred <- predict(ins_model2, insurance)
cor(insurance$pred, insurance$expenses)

[1] 0.9307999

plot(insurance$pred, insurance$expenses)
abline(a = 0, b = 1, col = "red", lwd = 3, lty = 2)

predict(ins_model2,
        data.frame(age = 30, age2 = 30^2, children = 2,
                   bmi = 30, sex = "male", bmi30 = 1,
                   smoker = "no", region = "northeast"))

       1 
5973.774

predict(ins_model2,
        data.frame(age = 30, age2 = 30^2, children = 0,
                   bmi = 30, sex = "female", bmi30 = 1,
                   smoker = "no", region = "northeast"))

      1 
5113.34

tee <- c(1, 1, 1, 2, 2, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7)
at1 <- c(1, 1, 1, 2, 2, 3, 4, 5, 5)
at2 <- c(6, 6, 7, 7, 7, 7)
bt1 <- c(1, 1, 1, 2, 2, 3, 4)
bt2 <- c(5, 5, 6, 6, 7, 7, 7, 7)
sdr_a <- sd(tee) - (length(at1) / length(tee) * sd(at1) + length(at2) / length(tee) * sd(at2))
sdr_b <- sd(tee) - (length(bt1) / length(tee) * sd(bt1) + length(bt2) / length(tee) * sd(bt2))
sdr_a

[1] 1.202815

sdr_b

[1] 1.392751

wine <- read.csv("winequality-red.csv")
str(wine)

'data.frame':   1599 obs. of  12 variables:
 $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
 $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
 $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
 $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
 $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
 $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
 $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
 $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
 $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
 $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
 $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
 $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...

hist(wine$quality)

summary(wine)

 fixed.acidity   volatile.acidity  citric.acid    residual.sugar     chlorides      
 Min.   : 4.60   Min.   :0.1200   Min.   :0.000   Min.   : 0.900   Min.   :0.01200  
 1st Qu.: 7.10   1st Qu.:0.3900   1st Qu.:0.090   1st Qu.: 1.900   1st Qu.:0.07000  
 Median : 7.90   Median :0.5200   Median :0.260   Median : 2.200   Median :0.07900  
 Mean   : 8.32   Mean   :0.5278   Mean   :0.271   Mean   : 2.539   Mean   :0.08747  
 3rd Qu.: 9.20   3rd Qu.:0.6400   3rd Qu.:0.420   3rd Qu.: 2.600   3rd Qu.:0.09000  
 Max.   :15.90   Max.   :1.5800   Max.   :1.000   Max.   :15.500   Max.   :0.61100  
 free.sulfur.dioxide total.sulfur.dioxide    density             pH          sulphates     
 Min.   : 1.00       Min.   :  6.00       Min.   :0.9901   Min.   :2.740   Min.   :0.3300  
 1st Qu.: 7.00       1st Qu.: 22.00       1st Qu.:0.9956   1st Qu.:3.210   1st Qu.:0.5500  
 Median :14.00       Median : 38.00       Median :0.9968   Median :3.310   Median :0.6200  
 Mean   :15.87       Mean   : 46.47       Mean   :0.9967   Mean   :3.311   Mean   :0.6581  
 3rd Qu.:21.00       3rd Qu.: 62.00       3rd Qu.:0.9978   3rd Qu.:3.400   3rd Qu.:0.7300  
 Max.   :72.00       Max.   :289.00       Max.   :1.0037   Max.   :4.010   Max.   :2.0000  
    alcohol         quality     
 Min.   : 8.40   Min.   :3.000  
 1st Qu.: 9.50   1st Qu.:5.000  
 Median :10.20   Median :6.000  
 Mean   :10.42   Mean   :5.636  
 3rd Qu.:11.10   3rd Qu.:6.000  
 Max.   :14.90   Max.   :8.000

wine_train <- wine[1:3750, ]
wine_test <- wine[3751:4898, ]
library(rpart)
m.rpart <- rpart(quality ~ ., data = wine_train)
m.rpart

n=1599 (2151 observations deleted due to missingness)

node), split, n, deviance, yval
      * denotes terminal node

 1) root 1599 1042.16500 5.636023  
   2) alcohol< 10.525 983  424.15870 5.366226  
     4) sulphates< 0.575 391  128.09720 5.150895 *
     5) sulphates>=0.575 592  265.95780 5.508446  
      10) volatile.acidity>=0.405 448  175.87280 5.404018 *
      11) volatile.acidity< 0.405 144   70.00000 5.833333 *
   3) alcohol>=10.525 616  432.27110 6.066558  
     6) sulphates< 0.645 272  191.86760 5.727941  
      12) volatile.acidity>=1.015 10    6.00000 4.000000 *
      13) volatile.acidity< 1.015 262  154.87020 5.793893  
        26) volatile.acidity>=0.495 146   73.67123 5.575342 *
        27) volatile.acidity< 0.495 116   65.44828 6.068966 *
     7) sulphates>=0.645 344  184.55520 6.334302  
      14) alcohol< 11.55 206  101.96600 6.121359  
        28) volatile.acidity>=0.395 111   37.42342 5.927928 *
        29) volatile.acidity< 0.395 95   55.53684 6.347368  
          58) pH>=3.255 59   29.72881 6.067797 *
          59) pH< 3.255 36   13.63889 6.805556 *
      15) alcohol>=11.55 138   59.30435 6.652174 *

summary(m.rpart)

Call:
rpart(formula = quality ~ ., data = wine_train)
  n=1599 (2151 observations deleted due to missingness)

          CP nsplit rel error    xerror       xstd
1 0.17822061      0 1.0000000 1.0017513 0.03790139
2 0.05358865      1 0.8217794 0.8385585 0.03678114
3 0.02974329      2 0.7681907 0.7940327 0.03360605
4 0.02888577      3 0.7384474 0.7939605 0.03300086
5 0.02234278      4 0.7095617 0.7736573 0.03211681
6 0.01927238      5 0.6872189 0.7511204 0.03070159
7 0.01511346      6 0.6679465 0.7318769 0.02955708
8 0.01015909      7 0.6528331 0.7249689 0.02925140
9 0.01000000      9 0.6325149 0.7140139 0.02911207

Variable importance
             alcohol     volatile.acidity              density            sulphates 
                  31                   17                   13                   13 
       fixed.acidity            chlorides          citric.acid                   pH 
                   7                    6                    5                    4 
total.sulfur.dioxide  free.sulfur.dioxide 
                   3                    1 

Node number 1: 1599 observations,    complexity param=0.1782206
  mean=5.636023, MSE=0.6517605 
  left son=2 (983 obs) right son=3 (616 obs)
  Primary splits:
      alcohol          < 10.525   to the left,  improve=0.17822060, (0 missing)
      sulphates        < 0.645    to the left,  improve=0.12565160, (0 missing)
      volatile.acidity < 0.425    to the right, improve=0.11400620, (0 missing)
      citric.acid      < 0.295    to the left,  improve=0.07225368, (0 missing)
      density          < 0.99539  to the right, improve=0.06402980, (0 missing)
  Surrogate splits:
      density              < 0.995575 to the right, agree=0.762, adj=0.383, (0 split)
      chlorides            < 0.0685   to the right, agree=0.690, adj=0.195, (0 split)
      volatile.acidity     < 0.3675   to the right, agree=0.662, adj=0.123, (0 split)
      fixed.acidity        < 6.75     to the right, agree=0.654, adj=0.101, (0 split)
      total.sulfur.dioxide < 17.5     to the right, agree=0.641, adj=0.068, (0 split)

Node number 2: 983 observations,    complexity param=0.02888577
  mean=5.366226, MSE=0.4314941 
  left son=4 (391 obs) right son=5 (592 obs)
  Primary splits:
      sulphates            < 0.575    to the left,  improve=0.07097282, (0 missing)
      volatile.acidity     < 0.335    to the right, improve=0.06388554, (0 missing)
      alcohol              < 9.85     to the left,  improve=0.05212216, (0 missing)
      fixed.acidity        < 10.85    to the left,  improve=0.03084011, (0 missing)
      total.sulfur.dioxide < 83.5     to the right, improve=0.02749674, (0 missing)
  Surrogate splits:
      density              < 0.996225 to the left,  agree=0.662, adj=0.151, (0 split)
      volatile.acidity     < 0.6525   to the right, agree=0.636, adj=0.084, (0 split)
      fixed.acidity        < 6.05     to the left,  agree=0.609, adj=0.018, (0 split)
      citric.acid          < 0.115    to the left,  agree=0.609, adj=0.018, (0 split)
      total.sulfur.dioxide < 9.5      to the left,  agree=0.608, adj=0.015, (0 split)

Node number 3: 616 observations,    complexity param=0.05358865
  mean=6.066558, MSE=0.7017388 
  left son=6 (272 obs) right son=7 (344 obs)
  Primary splits:
      sulphates        < 0.645    to the left,  improve=0.12919720, (0 missing)
      volatile.acidity < 0.87     to the right, improve=0.11482610, (0 missing)
      citric.acid      < 0.295    to the left,  improve=0.10819510, (0 missing)
      alcohol          < 11.55    to the left,  improve=0.10309310, (0 missing)
      pH               < 3.355    to the right, improve=0.07557599, (0 missing)
  Surrogate splits:
      citric.acid      < 0.245    to the left,  agree=0.683, adj=0.283, (0 split)
      fixed.acidity    < 7.85     to the left,  agree=0.666, adj=0.243, (0 split)
      volatile.acidity < 0.5875   to the right, agree=0.653, adj=0.213, (0 split)
      density          < 0.994915 to the left,  agree=0.635, adj=0.173, (0 split)
      pH               < 3.405    to the right, agree=0.630, adj=0.162, (0 split)

Node number 4: 391 observations
  mean=5.150895, MSE=0.3276143 

Node number 5: 592 observations,    complexity param=0.01927238
  mean=5.508446, MSE=0.449253 
  left son=10 (448 obs) right son=11 (144 obs)
  Primary splits:
      volatile.acidity     < 0.405    to the right, improve=0.07551952, (0 missing)
      total.sulfur.dioxide < 81.5     to the right, improve=0.05845854, (0 missing)
      alcohol              < 9.85     to the left,  improve=0.05386312, (0 missing)
      fixed.acidity        < 10.95    to the left,  improve=0.05335172, (0 missing)
      chlorides            < 0.0975   to the right, improve=0.03262428, (0 missing)
  Surrogate splits:
      fixed.acidity       < 10.45    to the left,  agree=0.787, adj=0.125, (0 split)
      chlorides           < 0.0565   to the right, agree=0.765, adj=0.035, (0 split)
      citric.acid         < 0.365    to the left,  agree=0.764, adj=0.028, (0 split)
      free.sulfur.dioxide < 2.5      to the right, agree=0.764, adj=0.028, (0 split)
      alcohol             < 8.6      to the right, agree=0.758, adj=0.007, (0 split)

Node number 6: 272 observations,    complexity param=0.02974329
  mean=5.727941, MSE=0.7053958 
  left son=12 (10 obs) right son=13 (262 obs)
  Primary splits:
      volatile.acidity < 1.015    to the right, improve=0.16155630, (0 missing)
      alcohol          < 11.45    to the left,  improve=0.11901850, (0 missing)
      citric.acid      < 0.255    to the left,  improve=0.11313180, (0 missing)
      pH               < 3.365    to the right, improve=0.09055459, (0 missing)
      sulphates        < 0.585    to the left,  improve=0.04970438, (0 missing)

Node number 7: 344 observations,    complexity param=0.02234278
  mean=6.334302, MSE=0.5364978 
  left son=14 (206 obs) right son=15 (138 obs)
  Primary splits:
      alcohol              < 11.55    to the left,  improve=0.12616750, (0 missing)
      chlorides            < 0.0785   to the right, improve=0.05765389, (0 missing)
      total.sulfur.dioxide < 101.5    to the right, improve=0.05496021, (0 missing)
      density              < 0.99537  to the right, improve=0.04412990, (0 missing)
      volatile.acidity     < 0.425    to the right, improve=0.04136603, (0 missing)
  Surrogate splits:
      density        < 0.994875 to the right, agree=0.701, adj=0.254, (0 split)
      chlorides      < 0.053    to the right, agree=0.651, adj=0.130, (0 split)
      fixed.acidity  < 5.55     to the right, agree=0.640, adj=0.101, (0 split)
      residual.sugar < 4.25     to the left,  agree=0.628, adj=0.072, (0 split)
      citric.acid    < 0.635    to the left,  agree=0.622, adj=0.058, (0 split)

Node number 10: 448 observations
  mean=5.404018, MSE=0.3925731 

Node number 11: 144 observations
  mean=5.833333, MSE=0.4861111 

Node number 12: 10 observations
  mean=4, MSE=0.6 

Node number 13: 262 observations,    complexity param=0.01511346
  mean=5.793893, MSE=0.5911077 
  left son=26 (146 obs) right son=27 (116 obs)
  Primary splits:
      volatile.acidity < 0.495    to the right, improve=0.10170270, (0 missing)
      alcohol          < 11.45    to the left,  improve=0.09838534, (0 missing)
      citric.acid      < 0.255    to the left,  improve=0.09415346, (0 missing)
      pH               < 3.295    to the right, improve=0.07618253, (0 missing)
      density          < 0.995155 to the right, improve=0.05214905, (0 missing)
  Surrogate splits:
      citric.acid          < 0.235    to the left,  agree=0.866, adj=0.698, (0 split)
      pH                   < 3.305    to the right, agree=0.733, adj=0.397, (0 split)
      fixed.acidity        < 7.85     to the left,  agree=0.691, adj=0.302, (0 split)
      alcohol              < 11.85    to the left,  agree=0.641, adj=0.190, (0 split)
      total.sulfur.dioxide < 12.5     to the right, agree=0.637, adj=0.181, (0 split)

Node number 14: 206 observations,    complexity param=0.01015909
  mean=6.121359, MSE=0.4949807 
  left son=28 (111 obs) right son=29 (95 obs)
  Primary splits:
      volatile.acidity     < 0.395    to the right, improve=0.08832113, (0 missing)
      total.sulfur.dioxide < 49.5     to the right, improve=0.06808035, (0 missing)
      chlorides            < 0.0945   to the right, improve=0.05079896, (0 missing)
      citric.acid          < 0.295    to the left,  improve=0.05051307, (0 missing)
      free.sulfur.dioxide  < 25.5     to the right, improve=0.03611908, (0 missing)
  Surrogate splits:
      citric.acid    < 0.285    to the left,  agree=0.733, adj=0.421, (0 split)
      sulphates      < 0.765    to the left,  agree=0.655, adj=0.253, (0 split)
      chlorides      < 0.0675   to the right, agree=0.617, adj=0.168, (0 split)
      residual.sugar < 1.85     to the right, agree=0.612, adj=0.158, (0 split)
      fixed.acidity  < 7.05     to the left,  agree=0.597, adj=0.126, (0 split)

Node number 15: 138 observations
  mean=6.652174, MSE=0.4297417 

Node number 26: 146 observations
  mean=5.575342, MSE=0.5045975 

Node number 27: 116 observations
  mean=6.068966, MSE=0.5642093 

Node number 28: 111 observations
  mean=5.927928, MSE=0.337148 

Node number 29: 95 observations,    complexity param=0.01015909
  mean=6.347368, MSE=0.5845983 
  left son=58 (59 obs) right son=59 (36 obs)
  Primary splits:
      pH                   < 3.255    to the right, improve=0.21911830, (0 missing)
      total.sulfur.dioxide < 56.5     to the right, improve=0.18528400, (0 missing)
      fixed.acidity        < 10       to the left,  improve=0.12899290, (0 missing)
      free.sulfur.dioxide  < 24.5     to the right, improve=0.11666000, (0 missing)
      alcohol              < 10.75    to the left,  improve=0.05498168, (0 missing)
  Surrogate splits:
      fixed.acidity        < 9.7      to the left,  agree=0.737, adj=0.306, (0 split)
      total.sulfur.dioxide < 28.5     to the right, agree=0.737, adj=0.306, (0 split)
      free.sulfur.dioxide  < 9.5      to the right, agree=0.716, adj=0.250, (0 split)
      chlorides            < 0.0635   to the right, agree=0.663, adj=0.111, (0 split)
      sulphates            < 0.935    to the left,  agree=0.663, adj=0.111, (0 split)

Node number 58: 59 observations
  mean=6.067797, MSE=0.5038782 

Node number 59: 36 observations
  mean=6.805556, MSE=0.378858

install.packages("rpart.plot")

trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/rpart.plot_3.1.3.tar.gz'
Content type 'application/x-gzip' length 1014419 bytes (990 KB)
==================================================
downloaded 990 KB


The downloaded source packages are in
    ‘/tmp/Rtmpom6lTf/downloaded_packages’

library(rpart.plot)

rpart.plot(m.rpart, digits = 3)

rpart.plot(m.rpart, digits = 4, fallen.leaves = TRUE, type = 3, extra = 101)

p.rpart <- predict(m.rpart, wine_test)
summary(p.rpart)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  5.404   5.404   5.404   5.404   5.404   5.404

summary(wine_test$quality)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
     NA      NA      NA     NaN      NA      NA    1148

cor(p.rpart, wine_test$quality)

[1] NA

MAE <- function(actual, predicted) {
  mean(abs(actual - predicted))  
}

MAE(p.rpart, wine_test$quality)

[1] NA

mean(wine_train$quality)

[1] NA

MAE(5.87, wine_test$quality)

[1] NA

install.packages("plyr")

trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/plyr_1.8.9.tar.gz'
Content type 'application/x-gzip' length 402041 bytes (392 KB)
==================================================
downloaded 392 KB


The downloaded source packages are in
    ‘/tmp/RtmpjTmTaf/downloaded_packages’

install.packages("Cubist")

trying URL 'http://rspm/default/__linux__/focal/latest/src/contrib/Cubist_0.5.1.tar.gz'
Content type 'application/x-gzip' length 881676 bytes (861 KB)
==================================================
downloaded 861 KB


The downloaded source packages are in
    ‘/tmp/RtmpjTmTaf/downloaded_packages’

library(Cubist)

m.cubist <- cubist(x = wine_train[-12], y = wine_train$quality)
m.cubist


Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)

Number of samples: 3750 
Number of predictors: 11 

Number of committees: 1 
Number of rules: 7

summary(m.cubist)


Call:
cubist.default(x = wine_train[-12], y = wine_train$quality)


Cubist [Release 2.07 GPL Edition]  Mon Feb  9 23:47:30 2026
---------------------------------

    Target attribute `outcome'

*** Ignoring cases with unknown or N/A target value

Read 1599 cases (12 attributes) from undefined.data

Model:

  Rule 1: [630 cases, mean 5.3, range 3 to 8, est err 0.4]

    if
    alcohol <= 9.8
    then
    outcome = 5 - 0.79 volatile.acidity - 0.099 alcohol
              + 0.052 fixed.acidity - 0.31 citric.acid + 0.33 sulphates
              + 0.29 pH - 0.0031 free.sulfur.dioxide
              - 0.0007 total.sulfur.dioxide - 0.4 chlorides

  Rule 2: [589 cases, mean 5.3, range 3 to 8, est err 0.4]

    if
    sulphates <= 0.92
    alcohol <= 9.8
    then
    outcome = 5.5 + 1.28 sulphates - 0.9 volatile.acidity - 0.33 citric.acid
              + 0.029 fixed.acidity - 0.033 alcohol
              - 0.0008 total.sulfur.dioxide - 0.0023 free.sulfur.dioxide
              - 0.4 chlorides - 0.1 pH

  Rule 3: [80 cases, mean 5.3, range 3 to 7, est err 0.7]

    if
    volatile.acidity > 0.31
    total.sulfur.dioxide <= 14
    sulphates <= 0.63
    alcohol > 9.8
    then
    outcome = 0.5 + 0.549 alcohol - 1.61 volatile.acidity + 0.36 sulphates
              - 0.18 pH - 0.0005 total.sulfur.dioxide - 0.07 citric.acid
              + 0.001 free.sulfur.dioxide

  Rule 4: [340 cases, mean 5.6, range 4 to 7, est err 0.5]

    if
    volatile.acidity > 0.31
    total.sulfur.dioxide > 14
    sulphates <= 0.63
    alcohol > 9.8
    then
    outcome = 5.1 + 2.85 sulphates + 0.19 alcohol - 0.74 citric.acid
              - 0.69 volatile.acidity - 0.74 pH
              - 0.0027 total.sulfur.dioxide + 0.0013 free.sulfur.dioxide

  Rule 5: [407 cases, mean 6.1, range 3 to 8, est err 0.6]

    if
    volatile.acidity > 0.31
    sulphates > 0.63
    alcohol > 9.8
    then
    outcome = 7.6 + 0.309 alcohol - 0.0073 total.sulfur.dioxide - 1.12 pH
              - 0.81 volatile.acidity - 0.079 fixed.acidity + 0.22 sulphates
              + 0.002 free.sulfur.dioxide

  Rule 6: [71 cases, mean 6.2, range 5 to 8, est err 0.5]

    if
    volatile.acidity <= 0.31
    sulphates <= 0.73
    alcohol > 9.8
    then
    outcome = 131.4 + 4.85 volatile.acidity - 124 density - 1.35 pH
              + 0.056 fixed.acidity + 0.54 sulphates + 0.036 alcohol
              + 0.021 residual.sugar

  Rule 7: [85 cases, mean 6.5, range 5 to 8, est err 0.4]

    if
    volatile.acidity <= 0.31
    sulphates > 0.73
    then
    outcome = 17 + 0.39 alcohol + 0.113 fixed.acidity
              + 0.25 volatile.acidity - 16 density + 0.14 sulphates


Evaluation on training data (1599 cases):

    Average  |error|                0.4
    Relative |error|               0.62
    Correlation coefficient        0.62


    Attribute usage:
      Conds  Model

       96%   100%    alcohol
       71%   100%    sulphates
       45%   100%    volatile.acidity
       19%    93%    total.sulfur.dioxide
              96%    pH
              93%    free.sulfur.dioxide
              81%    fixed.acidity
              74%    citric.acid
              55%    chlorides
               7%    density
               3%    residual.sugar


Time: 0.0 secs

p.cubist <- predict(m.cubist, wine_test)

summary(p.cubist)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  5.865   5.865   5.865   5.865   5.865   5.865

cor(p.cubist, wine_test$quality)

[1] NA

MAE(wine_test$quality, p.cubist)

[1] NA

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKVGhpcyBpcyBhbiAKYGBge3J9CmxhdW5jaCA8LSByZWFkLmNzdigiY2hhbGxlbmdlcjIuY3N2IikKYGBgCmBgYHtyfQpiIDwtIGNvdihsYXVuY2gkdGVtcGVyYXR1cmUsIGxhdW5jaCRkaXN0cmVzc19jdCkgLyB2YXIobGF1bmNoJHRlbXBlcmF0dXJlKQpiCmBgYApgYGB7cn0KY29yKGxhdW5jaCR0ZW1wZXJhdHVyZSwgbGF1bmNoJGRpc3RyZXNzX2N0KQpgYGAKYGBge3J9CnIgKiAoc2QobGF1bmNoJGRpc3RyZXNzX2N0KSAvIHNkKGxhdW5jaCR0ZW1wZXJhdHVyZSkpCmBgYApgYGB7cn0Kc3VtbWFyeShtb2RlbCkKYGBgCmBgYHtyfQpyZWcgPC0gZnVuY3Rpb24oeSwgeCkgewogIHggPC0gYXMubWF0cml4KHgpCiAgeCA8LSBjYmluZChJbnRlcmNlcHQgPSAxLCB4KQogIGIgPC0gc29sdmUodCh4KSAlKiUgeCkgJSolIHQoeCkgJSolIHkKICBjb2xuYW1lcyhiKSA8LSAiZXN0aW1hdGUiCiAgcHJpbnQoYikKfQpzdHIobGF1bmNoKQpgYGAKYGBge3J9CnJlZyh5ID0gbGF1bmNoJGRpc3RyZXNzX2N0LCB4ID0gbGF1bmNoWzJdKQpgYGAKYGBge3J9Cm1vZGVsIDwtIGxtKGRpc3RyZXNzX2N0IH4gdGVtcGVyYXR1cmUgKyBmaWVsZF9jaGVja19wcmVzc3VyZSArIGZsaWdodF9udW0sIGRhdGEgPSBsYXVuY2gpCm1vZGVsCmBgYApgYGB7cn0Kc3VtbWFyeShtb2RlbCkKYGBgCmBgYHtyfQppbnN1cmFuY2UgPC0gcmVhZC5jc3YoImluc3VyYW5jZS5jc3YiLCBzdHJpbmdzQXNGYWN0b3JzID0gVFJVRSkKc3RyKGluc3VyYW5jZSkKYGBgCmBgYHtyfQpzdW1tYXJ5KGluc3VyYW5jZSRleHBlbnNlcykKYGBgCmBgYHtyfQpoaXN0KGluc3VyYW5jZSRleHBlbnNlcykKYGBgCmBgYHtyfQp0YWJsZShpbnN1cmFuY2UkcmVnaW9uKQpgYGAKYGBge3J9CmNvcihpbnN1cmFuY2VbYygiYWdlIiwgImJtaSIsICJjaGlsZHJlbiIsICJleHBlbnNlcyIpXSkKYGBgCmBgYHtyfQpwYWlycyhpbnN1cmFuY2VbYygiYWdlIiwgImJtaSIsICJjaGlsZHJlbiIsICJleHBlbnNlcyIpXSkKYGBgCmBgYHtyfQppbnNfbW9kZWwgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBjaGlsZHJlbiArIGJtaSArIHNleCArIHNtb2tlciArIHJlZ2lvbiwKICAgICAgICAgICAgICAgIGRhdGEgPSBpbnN1cmFuY2UpCmluc19tb2RlbCA8LSBsbShleHBlbnNlcyB+IC4sIGRhdGEgPSBpbnN1cmFuY2UpCmluc19tb2RlbApgYGAKYGBge3J9CnN1bW1hcnkoaW5zX21vZGVsKQpgYGAKYGBge3J9Cmluc3VyYW5jZSRhZ2UyIDwtIGluc3VyYW5jZSRhZ2VeMgppbnN1cmFuY2UkYm1pMzAgPC0gaWZlbHNlKGluc3VyYW5jZSRibWkgPj0gMzAsIDEsIDApCmluc19tb2RlbDIgPC0gbG0oZXhwZW5zZXMgfiBhZ2UgKyBhZ2UyICsgY2hpbGRyZW4gKyBibWkgKyBzZXggKwogICAgICAgICAgICAgICAgICAgYm1pMzAqc21va2VyICsgcmVnaW9uLCBkYXRhID0gaW5zdXJhbmNlKQpzdW1tYXJ5KGluc19tb2RlbDIpCmBgYApgYGB7cn0KaW5zdXJhbmNlJHByZWQgPC0gcHJlZGljdChpbnNfbW9kZWwyLCBpbnN1cmFuY2UpCmNvcihpbnN1cmFuY2UkcHJlZCwgaW5zdXJhbmNlJGV4cGVuc2VzKQpgYGAKYGBge3J9CnBsb3QoaW5zdXJhbmNlJHByZWQsIGluc3VyYW5jZSRleHBlbnNlcykKYWJsaW5lKGEgPSAwLCBiID0gMSwgY29sID0gInJlZCIsIGx3ZCA9IDMsIGx0eSA9IDIpCmBgYApgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMzAsIGFnZTIgPSAzMF4yLCBjaGlsZHJlbiA9IDIsCiAgICAgICAgICAgICAgICAgICBibWkgPSAzMCwgc2V4ID0gIm1hbGUiLCBibWkzMCA9IDEsCiAgICAgICAgICAgICAgICAgICBzbW9rZXIgPSAibm8iLCByZWdpb24gPSAibm9ydGhlYXN0IikpCmBgYApgYGB7cn0KcHJlZGljdChpbnNfbW9kZWwyLAogICAgICAgIGRhdGEuZnJhbWUoYWdlID0gMzAsIGFnZTIgPSAzMF4yLCBjaGlsZHJlbiA9IDAsCiAgICAgICAgICAgICAgICAgICBibWkgPSAzMCwgc2V4ID0gImZlbWFsZSIsIGJtaTMwID0gMSwKICAgICAgICAgICAgICAgICAgIHNtb2tlciA9ICJubyIsIHJlZ2lvbiA9ICJub3J0aGVhc3QiKSkKYGBgCmBgYHtyfQp0ZWUgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQphdDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0LCA1LCA1KQphdDIgPC0gYyg2LCA2LCA3LCA3LCA3LCA3KQpidDEgPC0gYygxLCAxLCAxLCAyLCAyLCAzLCA0KQpidDIgPC0gYyg1LCA1LCA2LCA2LCA3LCA3LCA3LCA3KQpzZHJfYSA8LSBzZCh0ZWUpIC0gKGxlbmd0aChhdDEpIC8gbGVuZ3RoKHRlZSkgKiBzZChhdDEpICsgbGVuZ3RoKGF0MikgLyBsZW5ndGgodGVlKSAqIHNkKGF0MikpCnNkcl9iIDwtIHNkKHRlZSkgLSAobGVuZ3RoKGJ0MSkgLyBsZW5ndGgodGVlKSAqIHNkKGJ0MSkgKyBsZW5ndGgoYnQyKSAvIGxlbmd0aCh0ZWUpICogc2QoYnQyKSkKc2RyX2EKYGBgCmBgYHtyfQpzZHJfYgpgYGAKYGBge3J9CndpbmUgPC0gcmVhZC5jc3YoIndpbmVxdWFsaXR5LXJlZC5jc3YiKQpzdHIod2luZSkKYGBgCmBgYHtyfQpoaXN0KHdpbmUkcXVhbGl0eSkKYGBgCmBgYHtyfQpzdW1tYXJ5KHdpbmUpCmBgYApgYGB7cn0Kd2luZV90cmFpbiA8LSB3aW5lWzE6Mzc1MCwgXQp3aW5lX3Rlc3QgPC0gd2luZVszNzUxOjQ4OTgsIF0KbGlicmFyeShycGFydCkKbS5ycGFydCA8LSBycGFydChxdWFsaXR5IH4gLiwgZGF0YSA9IHdpbmVfdHJhaW4pCm0ucnBhcnQKYGBgCmBgYHtyfQpzdW1tYXJ5KG0ucnBhcnQpCmBgYApgYGB7cn0KaW5zdGFsbC5wYWNrYWdlcygicnBhcnQucGxvdCIpCmxpYnJhcnkocnBhcnQucGxvdCkKYGBgCmBgYHtyfQpycGFydC5wbG90KG0ucnBhcnQsIGRpZ2l0cyA9IDMpCmBgYApgYGB7cn0KcnBhcnQucGxvdChtLnJwYXJ0LCBkaWdpdHMgPSA0LCBmYWxsZW4ubGVhdmVzID0gVFJVRSwgdHlwZSA9IDMsIGV4dHJhID0gMTAxKQpgYGAKYGBge3J9CnAucnBhcnQgPC0gcHJlZGljdChtLnJwYXJ0LCB3aW5lX3Rlc3QpCnN1bW1hcnkocC5ycGFydCkKYGBgCmBgYHtyfQpzdW1tYXJ5KHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKYGBge3J9CmNvcihwLnJwYXJ0LCB3aW5lX3Rlc3QkcXVhbGl0eSkKYGBgCmBgYHtyfQpNQUUgPC0gZnVuY3Rpb24oYWN0dWFsLCBwcmVkaWN0ZWQpIHsKICBtZWFuKGFicyhhY3R1YWwgLSBwcmVkaWN0ZWQpKSAgCn0KCmBgYApgYGB7cn0KTUFFKHAucnBhcnQsIHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKYGBge3J9Cm1lYW4od2luZV90cmFpbiRxdWFsaXR5KQpgYGAKYGBge3J9Ck1BRSg1Ljg3LCB3aW5lX3Rlc3QkcXVhbGl0eSkKYGBgCmBgYHtyfQppbnN0YWxsLnBhY2thZ2VzKCJwbHlyIikKaW5zdGFsbC5wYWNrYWdlcygiQ3ViaXN0IikKbGlicmFyeShDdWJpc3QpCmBgYApgYGB7cn0KbS5jdWJpc3QgPC0gY3ViaXN0KHggPSB3aW5lX3RyYWluWy0xMl0sIHkgPSB3aW5lX3RyYWluJHF1YWxpdHkpCm0uY3ViaXN0CmBgYApgYGB7cn0Kc3VtbWFyeShtLmN1YmlzdCkKYGBgCmBgYHtyfQpwLmN1YmlzdCA8LSBwcmVkaWN0KG0uY3ViaXN0LCB3aW5lX3Rlc3QpCmBgYApgYGB7cn0Kc3VtbWFyeShwLmN1YmlzdCkKYGBgCmBgYHtyfQpjb3IocC5jdWJpc3QsIHdpbmVfdGVzdCRxdWFsaXR5KQpgYGAKYGBge3J9Ck1BRSh3aW5lX3Rlc3QkcXVhbGl0eSwgcC5jdWJpc3QpIApgYGAKCgo=