1

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Install the archived knitr 1.12 release from its CRAN source tarball.
# The tarball is fetched over HTTPS so the source that gets built and
# installed cannot be altered in transit.
packageurl <- "https://cran.r-project.org/src/contrib/Archive/knitr/knitr_1.12.tar.gz"
# repos = NULL makes install.packages() treat `packageurl` as a file/URL
# rather than as a package name to look up in a repository.
install.packages(packageurl, repos = NULL, type = "source")

## Installing package into 'C:/Users/koval/Documents/R/win-library/3.2'
## (as 'lib' is unspecified)

# Load the white-wine data set from disk into a data frame.
wine <- read.csv(file = "C:/whitewines.csv")
# Show the internal structure of `wine` (columns, types, first values).
str(object = wine)

## 'data.frame':    4898 obs. of  12 variables:
##  $ fixed.acidity       : num  7 6.3 8.1 7.2 7.2 8.1 6.2 7 6.3 8.1 ...
##  $ volatile.acidity    : num  0.27 0.3 0.28 0.23 0.23 0.28 0.32 0.27 0.3 0.22 ...
##  $ citric.acid         : num  0.36 0.34 0.4 0.32 0.32 0.4 0.16 0.36 0.34 0.43 ...
##  $ residual.sugar      : num  20.7 1.6 6.9 8.5 8.5 6.9 7 20.7 1.6 1.5 ...
##  $ chlorides           : num  0.045 0.049 0.05 0.058 0.058 0.05 0.045 0.045 0.049 0.044 ...
##  $ free.sulfur.dioxide : num  45 14 30 47 47 30 30 45 14 28 ...
##  $ total.sulfur.dioxide: num  170 132 97 186 186 97 136 170 132 129 ...
##  $ density             : num  1.001 0.994 0.995 0.996 0.996 ...
##  $ pH                  : num  3 3.3 3.26 3.19 3.19 3.26 3.18 3 3.3 3.22 ...
##  $ sulphates           : num  0.45 0.49 0.44 0.4 0.4 0.44 0.47 0.45 0.49 0.45 ...
##  $ alcohol             : num  8.8 9.5 10.1 9.9 9.9 10.1 9.6 8.8 9.5 11 ...
##  $ quality             : int  6 6 6 6 6 6 6 6 6 6 ...

hist(wine$quality) # plot a histogram of the tasters' quality scores

# Split the data into a training set and a test set by row position.
# NOTE(review): this is a positional split, not a random one; it assumes the
# rows of `wine` are already in random order -- TODO confirm.
n_train <- 3750 # number of leading rows used for training
# head()/tail() clamp to the real number of rows, so a data set with a
# different row count never yields all-NA filler rows the way fixed index
# ranges would. With 4898 rows this gives rows 1-3750 and 3751-4898.
wine_train <- head(wine, n_train) # first `n_train` rows
wine_test <- tail(wine, -n_train) # every remaining row

#3 training the model
library (rpart) # load the rpart package

## Warning: package 'rpart' was built under R version 3.2.5

# Fit a tree with `quality` as the outcome and every other column of
# `wine_train` as a predictor.
m.rpart <- rpart(formula = quality ~ ., data = wine_train)
# Print the basic description of the fitted tree.
m.rpart

## n= 3750 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 3750 3140.06000 5.886933  
##    2) alcohol< 10.85 2473 1510.66200 5.609381  
##      4) volatile.acidity>=0.2425 1406  740.15080 5.402560  
##        8) volatile.acidity>=0.4225 182   92.99451 4.994505 *
##        9) volatile.acidity< 0.4225 1224  612.34560 5.463235 *
##      5) volatile.acidity< 0.2425 1067  631.12090 5.881912 *
##    3) alcohol>=10.85 1277 1069.95800 6.424432  
##      6) free.sulfur.dioxide< 11.5 93   99.18280 5.473118 *
##      7) free.sulfur.dioxide>=11.5 1184  879.99920 6.499155  
##       14) alcohol< 11.85 611  447.38130 6.296236 *
##       15) alcohol>=11.85 573  380.63180 6.715532 *

# Print the detailed, node-by-node description of the fitted tree.
summary(object = m.rpart)

## Call:
## rpart(formula = quality ~ ., data = wine_train)
##   n= 3750 
## 
##           CP nsplit rel error    xerror       xstd
## 1 0.17816211      0 1.0000000 1.0004571 0.02388743
## 2 0.04439109      1 0.8218379 0.8228358 0.02238039
## 3 0.02890893      2 0.7774468 0.7851283 0.02202864
## 4 0.01655575      3 0.7485379 0.7582200 0.02088449
## 5 0.01108600      4 0.7319821 0.7466331 0.02050013
## 6 0.01000000      5 0.7208961 0.7437313 0.02042172
## 
## Variable importance
##              alcohol              density            chlorides 
##                   38                   23                   12 
##     volatile.acidity total.sulfur.dioxide  free.sulfur.dioxide 
##                   12                    7                    6 
##            sulphates                   pH       residual.sugar 
##                    1                    1                    1 
## 
## Node number 1: 3750 observations,    complexity param=0.1781621
##   mean=5.886933, MSE=0.8373493 
##   left son=2 (2473 obs) right son=3 (1277 obs)
##   Primary splits:
##       alcohol              < 10.85    to the left,  improve=0.17816210, (0 missing)
##       density              < 0.992385 to the right, improve=0.11980970, (0 missing)
##       chlorides            < 0.0395   to the right, improve=0.08199995, (0 missing)
##       total.sulfur.dioxide < 153.5    to the right, improve=0.03875440, (0 missing)
##       free.sulfur.dioxide  < 11.75    to the left,  improve=0.03632119, (0 missing)
##   Surrogate splits:
##       density              < 0.99201  to the right, agree=0.869, adj=0.614, (0 split)
##       chlorides            < 0.0375   to the right, agree=0.773, adj=0.334, (0 split)
##       total.sulfur.dioxide < 102.5    to the right, agree=0.705, adj=0.132, (0 split)
##       sulphates            < 0.345    to the right, agree=0.670, adj=0.031, (0 split)
##       fixed.acidity        < 5.25     to the right, agree=0.662, adj=0.009, (0 split)
## 
## Node number 2: 2473 observations,    complexity param=0.04439109
##   mean=5.609381, MSE=0.6108623 
##   left son=4 (1406 obs) right son=5 (1067 obs)
##   Primary splits:
##       volatile.acidity    < 0.2425   to the right, improve=0.09227123, (0 missing)
##       free.sulfur.dioxide < 13.5     to the left,  improve=0.04177240, (0 missing)
##       alcohol             < 10.15    to the left,  improve=0.03313802, (0 missing)
##       citric.acid         < 0.205    to the left,  improve=0.02721200, (0 missing)
##       pH                  < 3.325    to the left,  improve=0.01860335, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 111.5    to the right, agree=0.610, adj=0.097, (0 split)
##       pH                   < 3.295    to the left,  agree=0.598, adj=0.067, (0 split)
##       alcohol              < 10.05    to the left,  agree=0.590, adj=0.049, (0 split)
##       sulphates            < 0.715    to the left,  agree=0.584, adj=0.037, (0 split)
##       residual.sugar       < 1.85     to the right, agree=0.581, adj=0.029, (0 split)
## 
## Node number 3: 1277 observations,    complexity param=0.02890893
##   mean=6.424432, MSE=0.8378682 
##   left son=6 (93 obs) right son=7 (1184 obs)
##   Primary splits:
##       free.sulfur.dioxide  < 11.5     to the left,  improve=0.08484051, (0 missing)
##       alcohol              < 11.85    to the left,  improve=0.06149941, (0 missing)
##       fixed.acidity        < 7.35     to the right, improve=0.04259695, (0 missing)
##       residual.sugar       < 1.275    to the left,  improve=0.02795662, (0 missing)
##       total.sulfur.dioxide < 67.5     to the left,  improve=0.02541719, (0 missing)
##   Surrogate splits:
##       total.sulfur.dioxide < 48.5     to the left,  agree=0.937, adj=0.14, (0 split)
## 
## Node number 4: 1406 observations,    complexity param=0.011086
##   mean=5.40256, MSE=0.526423 
##   left son=8 (182 obs) right son=9 (1224 obs)
##   Primary splits:
##       volatile.acidity     < 0.4225   to the right, improve=0.04703189, (0 missing)
##       free.sulfur.dioxide  < 17.5     to the left,  improve=0.04607770, (0 missing)
##       total.sulfur.dioxide < 86.5     to the left,  improve=0.02894310, (0 missing)
##       alcohol              < 10.25    to the left,  improve=0.02890077, (0 missing)
##       chlorides            < 0.0455   to the right, improve=0.02096635, (0 missing)
##   Surrogate splits:
##       density       < 0.99107  to the left,  agree=0.874, adj=0.027, (0 split)
##       citric.acid   < 0.11     to the left,  agree=0.873, adj=0.022, (0 split)
##       fixed.acidity < 9.85     to the right, agree=0.873, adj=0.016, (0 split)
##       chlorides     < 0.206    to the right, agree=0.871, adj=0.005, (0 split)
## 
## Node number 5: 1067 observations
##   mean=5.881912, MSE=0.591491 
## 
## Node number 6: 93 observations
##   mean=5.473118, MSE=1.066482 
## 
## Node number 7: 1184 observations,    complexity param=0.01655575
##   mean=6.499155, MSE=0.7432425 
##   left son=14 (611 obs) right son=15 (573 obs)
##   Primary splits:
##       alcohol        < 11.85    to the left,  improve=0.05907511, (0 missing)
##       fixed.acidity  < 7.35     to the right, improve=0.04400660, (0 missing)
##       density        < 0.991395 to the right, improve=0.02522410, (0 missing)
##       residual.sugar < 1.225    to the left,  improve=0.02503936, (0 missing)
##       pH             < 3.245    to the left,  improve=0.02417936, (0 missing)
##   Surrogate splits:
##       density              < 0.991115 to the right, agree=0.710, adj=0.401, (0 split)
##       volatile.acidity     < 0.2675   to the left,  agree=0.665, adj=0.307, (0 split)
##       chlorides            < 0.0365   to the right, agree=0.631, adj=0.237, (0 split)
##       total.sulfur.dioxide < 126.5    to the right, agree=0.566, adj=0.103, (0 split)
##       residual.sugar       < 1.525    to the left,  agree=0.560, adj=0.091, (0 split)
## 
## Node number 8: 182 observations
##   mean=4.994505, MSE=0.5109588 
## 
## Node number 9: 1224 observations
##   mean=5.463235, MSE=0.5002823 
## 
## Node number 14: 611 observations
##   mean=6.296236, MSE=0.7322117 
## 
## Node number 15: 573 observations
##   mean=6.715532, MSE=0.6642788

library(rpart.plot) # load the rpart.plot package

## Warning: package 'rpart.plot' was built under R version 3.2.5

# Draw the regression tree for `m.rpart`, showing numbers to 3 digits.
rpart.plot(m.rpart, digits = 3)

# Draw it again with 4 digits, a different split-label layout (type = 3),
# leaf nodes aligned at the bottom (fallen.leaves = TRUE) and additional
# node labels (extra = 101).
rpart.plot(
  m.rpart,
  type = 3,
  extra = 101,
  digits = 4,
  fallen.leaves = TRUE
)

#4 evaluating model performance
# Predict quality for the held-out wines with the fitted tree.
p.rpart <- predict(object = m.rpart, newdata = wine_test)
# Summary statistics of the predicted values.
summary(object = p.rpart)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.995   5.463   5.882   5.999   6.296   6.716

# Summary statistics of the actual quality scores, for comparison with the
# predictions.
summary(object = wine_test$quality)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   3.000   5.000   6.000   5.848   6.000   8.000

# Correlation between the predicted and the actual quality scores.
cor(x = p.rpart, y = wine_test$quality)

## [1] 0.4931608

# Mean absolute error between observed and predicted values.
#
# Arguments:
#   actual    numeric vector of observed values.
#   predicted numeric vector of predicted values.
#   na.rm     if TRUE, pairs whose absolute difference is NA are dropped
#             before averaging. Defaults to FALSE, which keeps the original
#             behaviour of returning NA whenever any value is missing.
#
# Returns a single number: the mean of abs(actual - predicted).
MAE <- function(actual, predicted, na.rm = FALSE) {
  mean(abs(actual - predicted), na.rm = na.rm)
}
# Mean absolute error of the tree's predictions. Arguments follow MAE's
# (actual, predicted) order; the value is the same either way because the
# absolute difference is symmetric.
MAE(wine_test$quality, p.rpart)

## [1] 0.5732104

#5 improving model performance
library(rJava) # load the rJava package

## Warning: package 'rJava' was built under R version 3.2.3

library(RWeka) # load the RWeka package

## Warning: package 'RWeka' was built under R version 3.2.5

# Fit an M5P model tree with the same formula and training data as before.
m.m5p <- M5P(formula = quality ~ ., data = wine_train)
# Print the fitted model tree.
m.m5p

## M5 pruned model tree:
## (using smoothed linear models)
## 
## alcohol <= 10.85 : LM1 (2473/77.476%)
## alcohol >  10.85 : 
## |   free.sulfur.dioxide <= 20.5 : 
## |   |   free.sulfur.dioxide <= 10.5 : LM2 (81/104.574%)
## |   |   free.sulfur.dioxide >  10.5 : LM3 (224/87.002%)
## |   free.sulfur.dioxide >  20.5 : LM4 (972/84.073%)
## 
## LM num: 1
## quality = 
##  0.0777 * fixed.acidity 
##  - 2.3087 * volatile.acidity 
##  + 0.0732 * residual.sugar 
##  + 0.0022 * free.sulfur.dioxide 
##  - 155.0175 * density 
##  + 0.6462 * pH 
##  + 0.7923 * sulphates 
##  + 0.0758 * alcohol 
##  + 156.2102
## 
## LM num: 2
## quality = 
##  -0.0314 * fixed.acidity 
##  - 0.3415 * volatile.acidity 
##  + 1.7929 * citric.acid 
##  + 0.1316 * residual.sugar 
##  - 0.2456 * chlorides 
##  + 0.1212 * free.sulfur.dioxide 
##  - 178.6281 * density 
##  + 0.054 * pH 
##  + 0.1392 * sulphates 
##  + 0.0108 * alcohol 
##  + 180.6069
## 
## LM num: 3
## quality = 
##  -0.2019 * fixed.acidity 
##  - 2.3804 * volatile.acidity 
##  - 1.0851 * citric.acid 
##  + 0.0905 * residual.sugar 
##  - 0.2456 * chlorides 
##  + 0.0041 * free.sulfur.dioxide 
##  - 177.078 * density 
##  + 0.054 * pH 
##  + 0.0868 * sulphates 
##  + 0.0108 * alcohol 
##  + 183.5076
## 
## LM num: 4
## quality = 
##  0.0004 * fixed.acidity 
##  - 0.0325 * volatile.acidity 
##  + 0.0957 * residual.sugar 
##  - 5.9702 * chlorides 
##  + 0.0002 * free.sulfur.dioxide 
##  - 172.3931 * density 
##  + 1.0123 * pH 
##  + 1.1653 * sulphates 
##  + 0.1542 * alcohol 
##  + 171.6842
## 
## Number of Rules : 4

summary(m.m5p) # show the detailed summary of the model tree

## 
## === Summary ===
## 
## Correlation coefficient                  0.5932
## Mean absolute error                      0.5804
## Root mean squared error                  0.7367
## Relative absolute error                 83.3671 %
## Root relative squared error             80.507  %
## Total Number of Instances             3750

# Predict quality for the held-out wines with the model tree.
p.m5p <- predict(object = m.m5p, newdata = wine_test)
# Summary statistics of the predicted values.
summary(object = p.m5p)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.170   5.646   6.032   6.079   6.501   7.913

# Correlation between the predicted and the actual quality scores.
cor(x = p.m5p, y = wine_test$quality)

## [1] 0.531723

# Mean absolute error of the model tree's predictions.
MAE(actual = wine_test$quality, predicted = p.m5p)

## [1] 0.5660352

You can also embed plots, for example:

Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.