Librerías csv

library(readr) # Para importar datos
library(dplyr) # Para filtrar   
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr) # Para datos tabulares
library(ggplot2) # Para visualizar
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(caret)  # Para particionar
## Loading required package: lattice
library(Metrics) # Para determinar rmse
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
library(rpart) # Para árbol
library(rpart.plot) # Para árbol
# Read the Advertising CSV directly from its GitHub raw URL into a data frame.
datos <- read.csv(
  file = "https://raw.githubusercontent.com/rpizarrog/Analisis-Inteligente-de-datos/main/datos/Advertising.csv"
)
# Show the first ten rows as a quick check of the import.
head(x = datos, n = 10)
##     X    TV Radio Newspaper Sales
## 1   1 230.1  37.8      69.2  22.1
## 2   2  44.5  39.3      45.1  10.4
## 3   3  17.2  45.9      69.3   9.3
## 4   4 151.5  41.3      58.5  18.5
## 5   5 180.8  10.8      58.4  12.9
## 6   6   8.7  48.9      75.0   7.2
## 7   7  57.5  32.8      23.5  11.8
## 8   8 120.2  19.6      11.6  13.2
## 9   9   8.6   2.1       1.0   4.8
## 10 10 199.8   2.6      21.2  10.6

Explorando los datos

# Per-column descriptive statistics of the imported data.
summary(object = datos)
##        X                TV             Radio          Newspaper     
##  Min.   :  1.00   Min.   :  0.70   Min.   : 0.000   Min.   :  0.30  
##  1st Qu.: 50.75   1st Qu.: 74.38   1st Qu.: 9.975   1st Qu.: 12.75  
##  Median :100.50   Median :149.75   Median :22.900   Median : 25.75  
##  Mean   :100.50   Mean   :147.04   Mean   :23.264   Mean   : 30.55  
##  3rd Qu.:150.25   3rd Qu.:218.82   3rd Qu.:36.525   3rd Qu.: 45.10  
##  Max.   :200.00   Max.   :296.40   Max.   :49.600   Max.   :114.00  
##      Sales      
##  Min.   : 1.60  
##  1st Qu.:10.38  
##  Median :12.90  
##  Mean   :14.02  
##  3rd Qu.:17.40  
##  Max.   :27.00
# Compact view of the data frame: dimensions, column names and column types.
str(object = datos)
## 'data.frame':    200 obs. of  5 variables:
##  $ X        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ TV       : num  230.1 44.5 17.2 151.5 180.8 ...
##  $ Radio    : num  37.8 39.3 45.9 41.3 10.8 48.9 32.8 19.6 2.1 2.6 ...
##  $ Newspaper: num  69.2 45.1 69.3 58.5 58.4 75 23.5 11.6 1 21.2 ...
##  $ Sales    : num  22.1 10.4 9.3 18.5 12.9 7.2 11.8 13.2 4.8 10.6 ...

Limpiar datos

# Keep only the three predictors and the response; any other column
# (e.g. the row-counter column) is dropped.
datos <- dplyr::select(datos, TV, Radio, Newspaper, Sales)
# Print the cleaned data frame.
datos
##        TV Radio Newspaper Sales
## 1   230.1  37.8      69.2  22.1
## 2    44.5  39.3      45.1  10.4
## 3    17.2  45.9      69.3   9.3
## 4   151.5  41.3      58.5  18.5
## 5   180.8  10.8      58.4  12.9
## 6     8.7  48.9      75.0   7.2
## 7    57.5  32.8      23.5  11.8
## 8   120.2  19.6      11.6  13.2
## 9     8.6   2.1       1.0   4.8
## 10  199.8   2.6      21.2  10.6
## 11   66.1   5.8      24.2   8.6
## 12  214.7  24.0       4.0  17.4
## 13   23.8  35.1      65.9   9.2
## 14   97.5   7.6       7.2   9.7
## 15  204.1  32.9      46.0  19.0
## 16  195.4  47.7      52.9  22.4
## 17   67.8  36.6     114.0  12.5
## 18  281.4  39.6      55.8  24.4
## 19   69.2  20.5      18.3  11.3
## 20  147.3  23.9      19.1  14.6
## 21  218.4  27.7      53.4  18.0
## 22  237.4   5.1      23.5  12.5
## 23   13.2  15.9      49.6   5.6
## 24  228.3  16.9      26.2  15.5
## 25   62.3  12.6      18.3   9.7
## 26  262.9   3.5      19.5  12.0
## 27  142.9  29.3      12.6  15.0
## 28  240.1  16.7      22.9  15.9
## 29  248.8  27.1      22.9  18.9
## 30   70.6  16.0      40.8  10.5
## 31  292.9  28.3      43.2  21.4
## 32  112.9  17.4      38.6  11.9
## 33   97.2   1.5      30.0   9.6
## 34  265.6  20.0       0.3  17.4
## 35   95.7   1.4       7.4   9.5
## 36  290.7   4.1       8.5  12.8
## 37  266.9  43.8       5.0  25.4
## 38   74.7  49.4      45.7  14.7
## 39   43.1  26.7      35.1  10.1
## 40  228.0  37.7      32.0  21.5
## 41  202.5  22.3      31.6  16.6
## 42  177.0  33.4      38.7  17.1
## 43  293.6  27.7       1.8  20.7
## 44  206.9   8.4      26.4  12.9
## 45   25.1  25.7      43.3   8.5
## 46  175.1  22.5      31.5  14.9
## 47   89.7   9.9      35.7  10.6
## 48  239.9  41.5      18.5  23.2
## 49  227.2  15.8      49.9  14.8
## 50   66.9  11.7      36.8   9.7
## 51  199.8   3.1      34.6  11.4
## 52  100.4   9.6       3.6  10.7
## 53  216.4  41.7      39.6  22.6
## 54  182.6  46.2      58.7  21.2
## 55  262.7  28.8      15.9  20.2
## 56  198.9  49.4      60.0  23.7
## 57    7.3  28.1      41.4   5.5
## 58  136.2  19.2      16.6  13.2
## 59  210.8  49.6      37.7  23.8
## 60  210.7  29.5       9.3  18.4
## 61   53.5   2.0      21.4   8.1
## 62  261.3  42.7      54.7  24.2
## 63  239.3  15.5      27.3  15.7
## 64  102.7  29.6       8.4  14.0
## 65  131.1  42.8      28.9  18.0
## 66   69.0   9.3       0.9   9.3
## 67   31.5  24.6       2.2   9.5
## 68  139.3  14.5      10.2  13.4
## 69  237.4  27.5      11.0  18.9
## 70  216.8  43.9      27.2  22.3
## 71  199.1  30.6      38.7  18.3
## 72  109.8  14.3      31.7  12.4
## 73   26.8  33.0      19.3   8.8
## 74  129.4   5.7      31.3  11.0
## 75  213.4  24.6      13.1  17.0
## 76   16.9  43.7      89.4   8.7
## 77   27.5   1.6      20.7   6.9
## 78  120.5  28.5      14.2  14.2
## 79    5.4  29.9       9.4   5.3
## 80  116.0   7.7      23.1  11.0
## 81   76.4  26.7      22.3  11.8
## 82  239.8   4.1      36.9  12.3
## 83   75.3  20.3      32.5  11.3
## 84   68.4  44.5      35.6  13.6
## 85  213.5  43.0      33.8  21.7
## 86  193.2  18.4      65.7  15.2
## 87   76.3  27.5      16.0  12.0
## 88  110.7  40.6      63.2  16.0
## 89   88.3  25.5      73.4  12.9
## 90  109.8  47.8      51.4  16.7
## 91  134.3   4.9       9.3  11.2
## 92   28.6   1.5      33.0   7.3
## 93  217.7  33.5      59.0  19.4
## 94  250.9  36.5      72.3  22.2
## 95  107.4  14.0      10.9  11.5
## 96  163.3  31.6      52.9  16.9
## 97  197.6   3.5       5.9  11.7
## 98  184.9  21.0      22.0  15.5
## 99  289.7  42.3      51.2  25.4
## 100 135.2  41.7      45.9  17.2
## 101 222.4   4.3      49.8  11.7
## 102 296.4  36.3     100.9  23.8
## 103 280.2  10.1      21.4  14.8
## 104 187.9  17.2      17.9  14.7
## 105 238.2  34.3       5.3  20.7
## 106 137.9  46.4      59.0  19.2
## 107  25.0  11.0      29.7   7.2
## 108  90.4   0.3      23.2   8.7
## 109  13.1   0.4      25.6   5.3
## 110 255.4  26.9       5.5  19.8
## 111 225.8   8.2      56.5  13.4
## 112 241.7  38.0      23.2  21.8
## 113 175.7  15.4       2.4  14.1
## 114 209.6  20.6      10.7  15.9
## 115  78.2  46.8      34.5  14.6
## 116  75.1  35.0      52.7  12.6
## 117 139.2  14.3      25.6  12.2
## 118  76.4   0.8      14.8   9.4
## 119 125.7  36.9      79.2  15.9
## 120  19.4  16.0      22.3   6.6
## 121 141.3  26.8      46.2  15.5
## 122  18.8  21.7      50.4   7.0
## 123 224.0   2.4      15.6  11.6
## 124 123.1  34.6      12.4  15.2
## 125 229.5  32.3      74.2  19.7
## 126  87.2  11.8      25.9  10.6
## 127   7.8  38.9      50.6   6.6
## 128  80.2   0.0       9.2   8.8
## 129 220.3  49.0       3.2  24.7
## 130  59.6  12.0      43.1   9.7
## 131   0.7  39.6       8.7   1.6
## 132 265.2   2.9      43.0  12.7
## 133   8.4  27.2       2.1   5.7
## 134 219.8  33.5      45.1  19.6
## 135  36.9  38.6      65.6  10.8
## 136  48.3  47.0       8.5  11.6
## 137  25.6  39.0       9.3   9.5
## 138 273.7  28.9      59.7  20.8
## 139  43.0  25.9      20.5   9.6
## 140 184.9  43.9       1.7  20.7
## 141  73.4  17.0      12.9  10.9
## 142 193.7  35.4      75.6  19.2
## 143 220.5  33.2      37.9  20.1
## 144 104.6   5.7      34.4  10.4
## 145  96.2  14.8      38.9  11.4
## 146 140.3   1.9       9.0  10.3
## 147 240.1   7.3       8.7  13.2
## 148 243.2  49.0      44.3  25.4
## 149  38.0  40.3      11.9  10.9
## 150  44.7  25.8      20.6  10.1
## 151 280.7  13.9      37.0  16.1
## 152 121.0   8.4      48.7  11.6
## 153 197.6  23.3      14.2  16.6
## 154 171.3  39.7      37.7  19.0
## 155 187.8  21.1       9.5  15.6
## 156   4.1  11.6       5.7   3.2
## 157  93.9  43.5      50.5  15.3
## 158 149.8   1.3      24.3  10.1
## 159  11.7  36.9      45.2   7.3
## 160 131.7  18.4      34.6  12.9
## 161 172.5  18.1      30.7  14.4
## 162  85.7  35.8      49.3  13.3
## 163 188.4  18.1      25.6  14.9
## 164 163.5  36.8       7.4  18.0
## 165 117.2  14.7       5.4  11.9
## 166 234.5   3.4      84.8  11.9
## 167  17.9  37.6      21.6   8.0
## 168 206.8   5.2      19.4  12.2
## 169 215.4  23.6      57.6  17.1
## 170 284.3  10.6       6.4  15.0
## 171  50.0  11.6      18.4   8.4
## 172 164.5  20.9      47.4  14.5
## 173  19.6  20.1      17.0   7.6
## 174 168.4   7.1      12.8  11.7
## 175 222.4   3.4      13.1  11.5
## 176 276.9  48.9      41.8  27.0
## 177 248.4  30.2      20.3  20.2
## 178 170.2   7.8      35.2  11.7
## 179 276.7   2.3      23.7  11.8
## 180 165.6  10.0      17.6  12.6
## 181 156.6   2.6       8.3  10.5
## 182 218.5   5.4      27.4  12.2
## 183  56.2   5.7      29.7   8.7
## 184 287.6  43.0      71.8  26.2
## 185 253.8  21.3      30.0  17.6
## 186 205.0  45.1      19.6  22.6
## 187 139.5   2.1      26.6  10.3
## 188 191.1  28.7      18.2  17.3
## 189 286.0  13.9       3.7  15.9
## 190  18.7  12.1      23.4   6.7
## 191  39.5  41.1       5.8  10.8
## 192  75.5  10.8       6.0   9.9
## 193  17.2   4.1      31.6   5.9
## 194 166.8  42.0       3.6  19.6
## 195 149.7  35.6       6.0  17.3
## 196  38.2   3.7      13.8   7.6
## 197  94.2   4.9       8.1   9.7
## 198 177.0   9.3       6.4  12.8
## 199 283.6  42.0      66.2  25.5
## 200 232.1   8.6       8.7  13.4

Head(datos)

# First 20 rows of the cleaned data.
head(x = datos, n = 20)
##       TV Radio Newspaper Sales
## 1  230.1  37.8      69.2  22.1
## 2   44.5  39.3      45.1  10.4
## 3   17.2  45.9      69.3   9.3
## 4  151.5  41.3      58.5  18.5
## 5  180.8  10.8      58.4  12.9
## 6    8.7  48.9      75.0   7.2
## 7   57.5  32.8      23.5  11.8
## 8  120.2  19.6      11.6  13.2
## 9    8.6   2.1       1.0   4.8
## 10 199.8   2.6      21.2  10.6
## 11  66.1   5.8      24.2   8.6
## 12 214.7  24.0       4.0  17.4
## 13  23.8  35.1      65.9   9.2
## 14  97.5   7.6       7.2   9.7
## 15 204.1  32.9      46.0  19.0
## 16 195.4  47.7      52.9  22.4
## 17  67.8  36.6     114.0  12.5
## 18 281.4  39.6      55.8  24.4
## 19  69.2  20.5      18.3  11.3
## 20 147.3  23.9      19.1  14.6

Tail(datos)

# Last 20 rows of the cleaned data.
tail(x = datos, n = 20)
##        TV Radio Newspaper Sales
## 181 156.6   2.6       8.3  10.5
## 182 218.5   5.4      27.4  12.2
## 183  56.2   5.7      29.7   8.7
## 184 287.6  43.0      71.8  26.2
## 185 253.8  21.3      30.0  17.6
## 186 205.0  45.1      19.6  22.6
## 187 139.5   2.1      26.6  10.3
## 188 191.1  28.7      18.2  17.3
## 189 286.0  13.9       3.7  15.9
## 190  18.7  12.1      23.4   6.7
## 191  39.5  41.1       5.8  10.8
## 192  75.5  10.8       6.0   9.9
## 193  17.2   4.1      31.6   5.9
## 194 166.8  42.0       3.6  19.6
## 195 149.7  35.6       6.0  17.3
## 196  38.2   3.7      13.8   7.6
## 197  94.2   4.9       8.1   9.7
## 198 177.0   9.3       6.4  12.8
## 199 283.6  42.0      66.2  25.5
## 200 232.1   8.6       8.7  13.4

Datos de entrenamiento y validación

Datos de entrenamiento

# Fix the RNG seed so the train/validation split below is reproducible.
set.seed(1550)
n <- nrow(datos)  # total number of observations

# Row indices of the training sample (70% of the rows), produced by caret
# from the Sales column; list = FALSE returns them as a matrix of indices.
entrena <- createDataPartition(
  y = datos$Sales,
  p = 0.70,
  times = 1,
  list = FALSE
)

# Training data: the sampled rows, all columns ([rows, columns]).
datos.entrenamiento <- datos[entrena, ]

# Validation data: every row that was not sampled for training.
datos.validacion <- datos[-entrena, ]

Tail

# Render the last 20 training rows as a captioned table.
kable(
  x = tail(datos.entrenamiento, n = 20),
  caption = "Datos de entrenamiento ültimos 20 registros"
)
Datos de entrenamiento ültimos 20 registros
TV Radio Newspaper Sales
179 276.7 2.3 23.7 11.8
180 165.6 10.0 17.6 12.6
181 156.6 2.6 8.3 10.5
184 287.6 43.0 71.8 26.2
185 253.8 21.3 30.0 17.6
186 205.0 45.1 19.6 22.6
187 139.5 2.1 26.6 10.3
188 191.1 28.7 18.2 17.3
189 286.0 13.9 3.7 15.9
190 18.7 12.1 23.4 6.7
191 39.5 41.1 5.8 10.8
192 75.5 10.8 6.0 9.9
193 17.2 4.1 31.6 5.9
194 166.8 42.0 3.6 19.6
195 149.7 35.6 6.0 17.3
196 38.2 3.7 13.8 7.6
197 94.2 4.9 8.1 9.7
198 177.0 9.3 6.4 12.8
199 283.6 42.0 66.2 25.5
200 232.1 8.6 8.7 13.4

Datos de validación

Head

# Render the first 20 validation rows as a captioned table.
kable(
  x = head(datos.validacion, n = 20),
  caption = "Datos de Validación Primeros 20 registros"
)
Datos de Validación Primeros 20 registros
TV Radio Newspaper Sales
1 230.1 37.8 69.2 22.1
2 44.5 39.3 45.1 10.4
6 8.7 48.9 75.0 7.2
8 120.2 19.6 11.6 13.2
11 66.1 5.8 24.2 8.6
12 214.7 24.0 4.0 17.4
13 23.8 35.1 65.9 9.2
14 97.5 7.6 7.2 9.7
18 281.4 39.6 55.8 24.4
19 69.2 20.5 18.3 11.3
20 147.3 23.9 19.1 14.6
21 218.4 27.7 53.4 18.0
22 237.4 5.1 23.5 12.5
30 70.6 16.0 40.8 10.5
34 265.6 20.0 0.3 17.4
38 74.7 49.4 45.7 14.7
40 228.0 37.7 32.0 21.5
41 202.5 22.3 31.6 16.6
43 293.6 27.7 1.8 20.7
45 25.1 25.7 43.3 8.5

Tail

# Render the last 20 validation rows as a captioned table.
kable(
  x = tail(datos.validacion, n = 20),
  caption = "Datos de validació últimos 20 registros"
)
Datos de validació últimos 20 registros
TV Radio Newspaper Sales
109 13.1 0.4 25.6 5.3
110 255.4 26.9 5.5 19.8
114 209.6 20.6 10.7 15.9
116 75.1 35.0 52.7 12.6
121 141.3 26.8 46.2 15.5
130 59.6 12.0 43.1 9.7
131 0.7 39.6 8.7 1.6
132 265.2 2.9 43.0 12.7
134 219.8 33.5 45.1 19.6
138 273.7 28.9 59.7 20.8
144 104.6 5.7 34.4 10.4
153 197.6 23.3 14.2 16.6
154 171.3 39.7 37.7 19.0
158 149.8 1.3 24.3 10.1
165 117.2 14.7 5.4 11.9
168 206.8 5.2 19.4 12.2
170 284.3 10.6 6.4 15.0
175 222.4 3.4 13.1 11.5
182 218.5 5.4 27.4 12.2
183 56.2 5.7 29.7 8.7

Construir el modelo

# Fit a regression tree for Sales on the three advertising channels,
# using only the training rows.
modelo_ar <- rpart(
  formula = Sales ~ TV + Radio + Newspaper,
  data = datos.entrenamiento
)
# Print the fitted tree (nodes, splits, deviance and fitted value per node).
modelo_ar
## n= 142 
## 
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 142 3930.13500 14.061970  
##    2) TV< 101.55 50  335.17780  9.262000  
##      4) TV< 30.05 19   44.27684  6.726316 *
##      5) TV>=30.05 31   93.86194 10.816130  
##       10) Radio< 29.75 21   27.26952 10.004760 *
##       11) Radio>=29.75 10   23.73600 12.520000 *
##    3) TV>=101.55 92 1816.89100 16.670650  
##      6) Radio< 21.2 45  132.65110 13.044440  
##       12) Radio< 10.05 24   20.75958 11.820830 *
##       13) Radio>=10.05 21   34.89143 14.442860 *
##      7) Radio>=21.2 47  525.97490 20.142550  
##       14) TV< 179.8 14   41.55214 16.764290 *
##       15) TV>=179.8 33  256.86060 21.575760  
##         30) Radio< 35.85 14   21.70857 18.871430 *
##         31) Radio>=35.85 19   57.32105 23.568420 *

Resumen

# Detailed report of the fitted tree: complexity table, variable importance
# and the candidate splits considered at every node.
summary(object = modelo_ar)
## Call:
## rpart(formula = Sales ~ TV + Radio + Newspaper, data = datos.entrenamiento)
##   n= 142 
## 
##           CP nsplit  rel error    xerror       xstd
## 1 0.45241862      0 1.00000000 1.0058898 0.10594717
## 2 0.29471376      1 0.54758138 0.6479956 0.06268588
## 3 0.05790187      2 0.25286762 0.3348100 0.03642435
## 4 0.05013544      3 0.19496575 0.3168308 0.03257395
## 5 0.04524806      4 0.14483031 0.2803904 0.02836161
## 6 0.01959223      5 0.09958225 0.2108761 0.02470429
## 7 0.01090457      6 0.07999002 0.1587899 0.01951078
## 8 0.01000000      7 0.06908546 0.1497569 0.01685195
## 
## Variable importance
##        TV     Radio Newspaper 
##        53        36        11 
## 
## Node number 1: 142 observations,    complexity param=0.4524186
##   mean=14.06197, MSE=27.677 
##   left son=2 (50 obs) right son=3 (92 obs)
##   Primary splits:
##       TV        < 101.55 to the left,  improve=0.4524186, (0 missing)
##       Radio     < 41.2   to the left,  improve=0.3184424, (0 missing)
##       Newspaper < 51     to the left,  improve=0.1033655, (0 missing)
##   Surrogate splits:
##       Radio     < 1.7    to the left,  agree=0.683, adj=0.10, (0 split)
##       Newspaper < 2.3    to the left,  agree=0.669, adj=0.06, (0 split)
## 
## Node number 2: 50 observations,    complexity param=0.05013544
##   mean=9.262, MSE=6.703556 
##   left son=4 (19 obs) right son=5 (31 obs)
##   Primary splits:
##       TV        < 30.05  to the left,  improve=0.58786420, (0 missing)
##       Radio     < 31.35  to the left,  improve=0.19123480, (0 missing)
##       Newspaper < 33.75  to the left,  improve=0.08279888, (0 missing)
##   Surrogate splits:
##       Newspaper < 40.15  to the right, agree=0.66, adj=0.105, (0 split)
## 
## Node number 3: 92 observations,    complexity param=0.2947138
##   mean=16.67065, MSE=19.74881 
##   left son=6 (45 obs) right son=7 (47 obs)
##   Primary splits:
##       Radio     < 21.2   to the left,  improve=0.6374983, (0 missing)
##       TV        < 181.7  to the left,  improve=0.2020322, (0 missing)
##       Newspaper < 37.35  to the left,  improve=0.1365813, (0 missing)
##   Surrogate splits:
##       Newspaper < 37.35  to the left,  agree=0.652, adj=0.289, (0 split)
##       TV        < 189.75 to the left,  agree=0.598, adj=0.178, (0 split)
## 
## Node number 4: 19 observations
##   mean=6.726316, MSE=2.33036 
## 
## Node number 5: 31 observations,    complexity param=0.01090457
##   mean=10.81613, MSE=3.027804 
##   left son=10 (21 obs) right son=11 (10 obs)
##   Primary splits:
##       Radio     < 29.75  to the left,  improve=0.4565899, (0 missing)
##       Newspaper < 32.25  to the left,  improve=0.4512462, (0 missing)
##       TV        < 53.75  to the left,  improve=0.1207227, (0 missing)
##   Surrogate splits:
##       Newspaper < 44.1   to the left,  agree=0.774, adj=0.3, (0 split)
##       TV        < 38.1   to the right, agree=0.710, adj=0.1, (0 split)
## 
## Node number 6: 45 observations,    complexity param=0.01959223
##   mean=13.04444, MSE=2.947802 
##   left son=12 (24 obs) right son=13 (21 obs)
##   Primary splits:
##       Radio     < 10.05  to the left,  improve=0.58047080, (0 missing)
##       TV        < 171.35 to the left,  improve=0.27832210, (0 missing)
##       Newspaper < 21.3   to the left,  improve=0.02044107, (0 missing)
##   Surrogate splits:
##       Newspaper < 21.3   to the left,  agree=0.622, adj=0.190, (0 split)
##       TV        < 196.5  to the right, agree=0.600, adj=0.143, (0 split)
## 
## Node number 7: 47 observations,    complexity param=0.05790187
##   mean=20.14255, MSE=11.19096 
##   left son=14 (14 obs) right son=15 (33 obs)
##   Primary splits:
##       TV        < 179.8  to the left,  improve=0.4326483, (0 missing)
##       Radio     < 41.4   to the left,  improve=0.4245020, (0 missing)
##       Newspaper < 18.35  to the left,  improve=0.1337402, (0 missing)
##   Surrogate splits:
##       Newspaper < 8.85   to the left,  agree=0.723, adj=0.071, (0 split)
## 
## Node number 10: 21 observations
##   mean=10.00476, MSE=1.298549 
## 
## Node number 11: 10 observations
##   mean=12.52, MSE=2.3736 
## 
## Node number 12: 24 observations
##   mean=11.82083, MSE=0.8649826 
## 
## Node number 13: 21 observations
##   mean=14.44286, MSE=1.661497 
## 
## Node number 14: 14 observations
##   mean=16.76429, MSE=2.96801 
## 
## Node number 15: 33 observations,    complexity param=0.04524806
##   mean=21.57576, MSE=7.783655 
##   left son=30 (14 obs) right son=31 (19 obs)
##   Primary splits:
##       Radio     < 35.85  to the left,  improve=0.69232490, (0 missing)
##       TV        < 257.55 to the left,  improve=0.35640150, (0 missing)
##       Newspaper < 31.9   to the left,  improve=0.08238753, (0 missing)
##   Surrogate splits:
##       TV        < 238.65 to the left,  agree=0.606, adj=0.071, (0 split)
##       Newspaper < 18.35  to the left,  agree=0.606, adj=0.071, (0 split)
## 
## Node number 30: 14 observations
##   mean=18.87143, MSE=1.550612 
## 
## Node number 31: 19 observations
##   mean=23.56842, MSE=3.016898

Plot del árbol de regresión

# Draw the fitted regression tree.
rpart.plot(x = modelo_ar)

Predecir valores con datos de validación

# Predicted Sales for every validation row, using the fitted tree.
predicciones <- predict(modelo_ar, newdata = datos.validacion)
# Put observed values and predictions side by side for inspection.
comparaciones <- data.frame(datos.validacion, predicciones = predicciones)
comparaciones
##        TV Radio Newspaper Sales predicciones
## 1   230.1  37.8      69.2  22.1    23.568421
## 2    44.5  39.3      45.1  10.4    12.520000
## 6     8.7  48.9      75.0   7.2     6.726316
## 8   120.2  19.6      11.6  13.2    14.442857
## 11   66.1   5.8      24.2   8.6    10.004762
## 12  214.7  24.0       4.0  17.4    18.871429
## 13   23.8  35.1      65.9   9.2     6.726316
## 14   97.5   7.6       7.2   9.7    10.004762
## 18  281.4  39.6      55.8  24.4    23.568421
## 19   69.2  20.5      18.3  11.3    10.004762
## 20  147.3  23.9      19.1  14.6    16.764286
## 21  218.4  27.7      53.4  18.0    18.871429
## 22  237.4   5.1      23.5  12.5    11.820833
## 30   70.6  16.0      40.8  10.5    10.004762
## 34  265.6  20.0       0.3  17.4    14.442857
## 38   74.7  49.4      45.7  14.7    12.520000
## 40  228.0  37.7      32.0  21.5    23.568421
## 41  202.5  22.3      31.6  16.6    18.871429
## 43  293.6  27.7       1.8  20.7    18.871429
## 45   25.1  25.7      43.3   8.5     6.726316
## 46  175.1  22.5      31.5  14.9    16.764286
## 50   66.9  11.7      36.8   9.7    10.004762
## 53  216.4  41.7      39.6  22.6    23.568421
## 55  262.7  28.8      15.9  20.2    18.871429
## 58  136.2  19.2      16.6  13.2    14.442857
## 61   53.5   2.0      21.4   8.1    10.004762
## 71  199.1  30.6      38.7  18.3    18.871429
## 73   26.8  33.0      19.3   8.8     6.726316
## 77   27.5   1.6      20.7   6.9     6.726316
## 83   75.3  20.3      32.5  11.3    10.004762
## 87   76.3  27.5      16.0  12.0    10.004762
## 95  107.4  14.0      10.9  11.5    14.442857
## 96  163.3  31.6      52.9  16.9    16.764286
## 97  197.6   3.5       5.9  11.7    11.820833
## 99  289.7  42.3      51.2  25.4    23.568421
## 100 135.2  41.7      45.9  17.2    16.764286
## 105 238.2  34.3       5.3  20.7    18.871429
## 108  90.4   0.3      23.2   8.7    10.004762
## 109  13.1   0.4      25.6   5.3     6.726316
## 110 255.4  26.9       5.5  19.8    18.871429
## 114 209.6  20.6      10.7  15.9    14.442857
## 116  75.1  35.0      52.7  12.6    12.520000
## 121 141.3  26.8      46.2  15.5    16.764286
## 130  59.6  12.0      43.1   9.7    10.004762
## 131   0.7  39.6       8.7   1.6     6.726316
## 132 265.2   2.9      43.0  12.7    11.820833
## 134 219.8  33.5      45.1  19.6    18.871429
## 138 273.7  28.9      59.7  20.8    18.871429
## 144 104.6   5.7      34.4  10.4    11.820833
## 153 197.6  23.3      14.2  16.6    18.871429
## 154 171.3  39.7      37.7  19.0    16.764286
## 158 149.8   1.3      24.3  10.1    11.820833
## 165 117.2  14.7       5.4  11.9    14.442857
## 168 206.8   5.2      19.4  12.2    11.820833
## 170 284.3  10.6       6.4  15.0    14.442857
## 175 222.4   3.4      13.1  11.5    11.820833
## 182 218.5   5.4      27.4  12.2    11.820833
## 183  56.2   5.7      29.7   8.7    10.004762

RMSE

# Root-mean-square error between observed and predicted Sales.
# NOTE(review): the result is stored under the same name as the
# Metrics::rmse() function, so the call is namespace-qualified to make
# explicit which object is the function and which is the value.
rmse <- Metrics::rmse(
  actual = comparaciones[["Sales"]],
  predicted = comparaciones[["predicciones"]]
)
rmse
## [1] 1.649371

Graficar predicciones contra valores reales

# Line chart of observed Sales (blue) against the tree's predictions (red),
# one point per row of `comparaciones`, in data-frame order.
# seq_len() replaces 1:nrow(), which would yield c(1, 0) for an empty data
# frame; the row index is mapped once in ggplot() so both layers share it.
# An explicit x label is set because the default would be the raw expression.
ggplot(
  data = comparaciones,
  mapping = aes(x = seq_len(nrow(comparaciones)))
) +
  geom_line(aes(y = Sales), colour = "blue") +
  geom_line(aes(y = predicciones), colour = "red") +
  xlab("Observación") +
  ggtitle(
    label = "Valores reales vs predichos Advertising",
    subtitle = "Arbol de Regresión"
  )

Predicciones con datos nuevos

# Advertising budgets for two new observations to be scored by the model.
TV <- c(140, 160)
Radio <- c(60, 40)
Newspaper <- c(80, 90)

# Column names are spelled out so they match the model's predictor names.
nuevos <- data.frame(TV = TV, Radio = Radio, Newspaper = Newspaper)
nuevos
##    TV Radio Newspaper
## 1 140    60        80
## 2 160    40        90
# Predicted Sales for the new observations, using the fitted tree.
Y.predicciones <- predict(modelo_ar, newdata = nuevos)
Y.predicciones
##        1        2 
## 16.76429 16.76429

Interpretación

Usando la semilla de 1550 y una partición de datos del 70/30 en datos de entrenamiento y validación, obtuve los siguientes datos:

# Redraw the fitted tree next to its interpretation.
rpart.plot(x = modelo_ar)

La primera división del árbol depende de lo invertido en TV. Según los datos de entrenamiento, los casos en los que se invirtieron menos de 102 millones (101.55) en TV tuvieron ventas promedio de 9.26, por debajo de la media general de ventas de 14.06. Este grupo concentra el 35% de los datos, mientras que el 65% restante corresponde a inversiones en TV iguales o mayores a 102.

Para caer en el grupo con las ventas más bajas, la única variable relevante es TV: con una inversión menor a 30 (30.05) en TV, el modelo predice una media de ventas de 6.7, con un error típico (RMSE del modelo) de 1.649371 por arriba o por debajo. Este grupo representa el 13% de los datos de entrenamiento.

En cambio, para alcanzar el grupo con las mayores ventas son indispensables dos variables, TV y Radio: se necesita una inversión mínima de 180 en TV y de 36 en Radio, respectivamente, con lo que el modelo predice una media de ventas de 23.57.

Por último, obtuve que la raíz del error cuadrático medio (RMSE) del modelo sobre los datos de validación es de 1.649371.

# Show the RMSE value computed earlier in the report.
print(rmse)
## [1] 1.649371