Librerías y csv

library(lattice)
library(readr)   # Sirve para importar datos
library(Metrics) # Sirve para construir métricas y valorar modelos 
library (ggplot2) # Sirve para gráficos
library(caret)    # Para partir los datos Entrenamiento y Validación.
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
## 
##     precision, recall
library(knitr)
datos <- read.csv("https://raw.githubusercontent.com/rpizarrog/Analisis-Inteligente-de-datos/main/datos/Advertising_Web.csv")
summary(datos)
##       X.1               X                TV             Radio       
##  Min.   :  1.00   Min.   :  1.00   Min.   :  0.70   Min.   : 0.000  
##  1st Qu.: 50.75   1st Qu.: 50.75   1st Qu.: 74.38   1st Qu.: 9.975  
##  Median :100.50   Median :100.50   Median :149.75   Median :22.900  
##  Mean   :100.50   Mean   :100.50   Mean   :147.04   Mean   :23.264  
##  3rd Qu.:150.25   3rd Qu.:150.25   3rd Qu.:218.82   3rd Qu.:36.525  
##  Max.   :200.00   Max.   :200.00   Max.   :296.40   Max.   :49.600  
##    Newspaper           Web              Sales      
##  Min.   :  0.30   Min.   :  4.308   Min.   : 1.60  
##  1st Qu.: 12.75   1st Qu.: 99.049   1st Qu.:10.38  
##  Median : 25.75   Median :156.862   Median :12.90  
##  Mean   : 30.55   Mean   :159.587   Mean   :14.02  
##  3rd Qu.: 45.10   3rd Qu.:212.312   3rd Qu.:17.40  
##  Max.   :114.00   Max.   :358.247   Max.   :27.00

Visualizar la dispersión de los datos

ggplot(datos, aes(x = TV, y = Sales)) + 
  geom_point(colour = "blue") + 
  geom_smooth(colour = 'orange', method = lm) +
  geom_smooth(colour = 'red') 
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Datos de entrenamiento y validación

set.seed(2022)
entrena <- createDataPartition(y = datos$Sales, p = 0.80, list = FALSE, times = 1)

# Datos entrenamiento
datos.entrenamiento <- datos[entrena, ]  # [renglones, columna]

# Datos validación
datos.validacion <- datos[-entrena, ]

Entrenamiento

Tail

tail(datos.entrenamiento, 10)
##     X.1   X    TV Radio Newspaper       Web Sales
## 189 189 189 286.0  13.9       3.7 151.99073  15.9
## 190 190 190  18.7  12.1      23.4 222.90695   6.7
## 192 192 192  75.5  10.8       6.0 301.48119   9.9
## 193 193 193  17.2   4.1      31.6 265.02864   5.9
## 194 194 194 166.8  42.0       3.6 192.24621  19.6
## 195 195 195 149.7  35.6       6.0  99.57998  17.3
## 196 196 196  38.2   3.7      13.8 248.84107   7.6
## 197 197 197  94.2   4.9       8.1 118.04186   9.7
## 198 198 198 177.0   9.3       6.4 213.27467  12.8
## 200 200 200 232.1   8.6       8.7 151.99073  13.4

Validación

Head

head(datos.validacion, 10)
##    X.1  X    TV Radio Newspaper       Web Sales
## 10  10 10 199.8   2.6      21.2 111.27226  10.6
## 33  33 33  97.2   1.5      30.0 139.78109   9.6
## 37  37 37 266.9  43.8       5.0  96.31683  25.4
## 49  49 49 227.2  15.8      49.9  75.26918  14.8
## 66  66 66  69.0   9.3       0.9 205.99349   9.3
## 67  67 67  31.5  24.6       2.2 216.47140   9.5
## 70  70 70 216.8  43.9      27.2 149.39610  22.3
## 76  76 76  16.9  43.7      89.4  70.23428   8.7
## 78  78 78 120.5  28.5      14.2  97.45513  14.2
## 83  83 83  75.3  20.3      32.5 231.20983  11.3

Tail

tail(datos.validacion, 10)
##     X.1   X    TV Radio Newspaper       Web Sales
## 149 149 149  38.0  40.3      11.9  75.20798  10.9
## 153 153 153 197.6  23.3      14.2 159.52256  16.6
## 168 168 168 206.8   5.2      19.4 115.37196  12.2
## 169 169 169 215.4  23.6      57.6 203.43127  17.1
## 171 171 171  50.0  11.6      18.4  64.01480   8.4
## 172 172 172 164.5  20.9      47.4  96.18039  14.5
## 179 179 179 276.7   2.3      23.7 137.32377  11.8
## 188 188 188 191.1  28.7      18.2 239.27571  17.3
## 191 191 191  39.5  41.1       5.8 219.89058  10.8
## 199 199 199 283.6  42.0      66.2 237.49806  25.5

Modelo de regresión lineal simple

modelo.simple <- lm(data = datos.entrenamiento, formula = Sales ~ TV)

Resumen del modelo y los coeficientes

resumen <- summary(modelo.simple)
resumen
## 
## Call:
## lm(formula = Sales ~ TV, data = datos.entrenamiento)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.9573 -2.0042 -0.2716  2.0227  7.2679 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 7.060804   0.518127   13.63   <2e-16 ***
## TV          0.047116   0.003031   15.54   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.308 on 160 degrees of freedom
## Multiple R-squared:  0.6016, Adjusted R-squared:  0.5992 
## F-statistic: 241.7 on 1 and 160 DF,  p-value: < 2.2e-16
a <- modelo.simple$coefficients[1]
b <- modelo.simple$coefficients[2]
a; b
## (Intercept) 
##    7.060804
##         TV 
## 0.04711566

R Squared

resumen$r.squared
## [1] 0.6016483

Predicciones

predicciones <- predict(object = modelo.simple, newdata = datos.validacion)

Estimación lineal

ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) + 
  geom_point(colour = "blue") + 
  geom_line(aes(x = TV, y = modelo.simple$fitted.values, colour = 'red'))

Generar tabla comparativa

tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
##        TV Sales.real Sales.predicciones
## 10  199.8       10.6          16.474512
## 33   97.2        9.6          11.640446
## 37  266.9       25.4          19.635973
## 49  227.2       14.8          17.765481
## 66   69.0        9.3          10.311784
## 67   31.5        9.5           8.544947
## 70  216.8       22.3          17.275478
## 76   16.9        8.7           7.857058
## 78  120.5       14.2          12.738240
## 83   75.3       11.3          10.608613
## 93  217.7       19.4          17.317882
## 95  107.4       11.5          12.121025
## 96  163.3       16.9          14.754791
## 101 222.4       11.7          17.539326
## 105 238.2       20.7          18.283753
## 107  25.0        7.2           8.238695
## 109  13.1        5.3           7.678019
## 114 209.6       15.9          16.936245
## 124 123.1       15.2          12.860741
## 125 229.5       19.7          17.873847
## 128  80.2        8.8          10.839479
## 130  59.6        9.7           9.868897
## 133   8.4        5.7           7.456575
## 134 219.8       19.6          17.416825
## 136  48.3       11.6           9.336490
## 140 184.9       20.7          15.772489
## 143 220.5       20.1          17.449806
## 145  96.2       11.4          11.593330
## 149  38.0       10.9           8.851199
## 153 197.6       16.6          16.370858
## 168 206.8       12.2          16.804322
## 169 215.4       17.1          17.209516
## 171  50.0        8.4           9.416587
## 172 164.5       14.5          14.811329
## 179 276.7       11.8          20.097706
## 188 191.1       17.3          16.064606
## 191  39.5       10.8           8.921872
## 199 283.6       25.5          20.422804

Evaluar predicciones con rmse

rmse.lineal <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.lineal
## [1] 3.042904

Modelo de regresión polinomial de segundo nivel

Usando argumento Poly

modelo.poly2 <- lm(data = datos.entrenamiento, formula = Sales ~ poly(x = TV, degree = 2, raw = TRUE))

Usando argumento I

modelo.poly2.I <- lm(Sales ~ TV + I(x = TV ^ 2), data = datos.entrenamiento)

Resumen del modelo

resumen <- summary(modelo.poly2)
resumen
## 
## Call:
## lm(formula = Sales ~ poly(x = TV, degree = 2, raw = TRUE), data = datos.entrenamiento)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.9497 -2.0174 -0.2908  1.9922  7.6601 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)
## (Intercept)                            5.996e+00  7.421e-01   8.079 1.53e-13
## poly(x = TV, degree = 2, raw = TRUE)1  6.959e-02  1.170e-02   5.949 1.66e-08
## poly(x = TV, degree = 2, raw = TRUE)2 -7.727e-05  3.887e-05  -1.988   0.0485
##                                          
## (Intercept)                           ***
## poly(x = TV, degree = 2, raw = TRUE)1 ***
## poly(x = TV, degree = 2, raw = TRUE)2 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.278 on 159 degrees of freedom
## Multiple R-squared:  0.6113, Adjusted R-squared:  0.6064 
## F-statistic:   125 on 2 and 159 DF,  p-value: < 2.2e-16

El valor de los coeficientes con modelo Poly

b0 <- modelo.poly2$coefficients[1]
b1 <- modelo.poly2$coefficients[2]
b2 <- modelo.poly2$coefficients[3]
b0; b1; b2
## (Intercept) 
##    5.995662
## poly(x = TV, degree = 2, raw = TRUE)1 
##                            0.06958871
## poly(x = TV, degree = 2, raw = TRUE)2 
##                         -7.727362e-05

Coeficientes con el modelo I

bI0 <- modelo.poly2$coefficients[1]
bI1 <- modelo.poly2$coefficients[2]
bI2 <- modelo.poly2$coefficients[3]
bI0; bI1; bI2
## (Intercept) 
##    5.995662
## poly(x = TV, degree = 2, raw = TRUE)1 
##                            0.06958871
## poly(x = TV, degree = 2, raw = TRUE)2 
##                         -7.727362e-05

Valor de R Square

resumen$r.squared
## [1] 0.6113084

Predicciones del modelo

predicciones <- predict(object = modelo.poly2, newdata = datos.validacion)

Curva de estimación

ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) + 
  geom_point(colour = "blue") + 
  geom_line(aes(x = TV, y = modelo.poly2$fitted.values, colour = 'red'))

Generar tabla comparativa

tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
##        TV Sales.real Sales.predicciones
## 10  199.8       10.6          16.814721
## 33   97.2        9.6          12.029616
## 37  266.9       25.4          19.064256
## 49  227.2       14.8          17.817365
## 66   69.0        9.3          10.429383
## 67   31.5        9.5           8.111032
## 70  216.8       22.3          17.450462
## 76   16.9        8.7           7.149641
## 78  120.5       14.2          13.259070
## 83   75.3       11.3          10.797544
## 93  217.7       19.4          17.482874
## 95  107.4       11.5          12.578157
## 96  163.3       16.9          15.298852
## 101 222.4       11.7          17.650102
## 105 238.2       20.7          18.187247
## 107  25.0        7.2           7.687084
## 109  13.1        5.3           6.894013
## 114 209.6       15.9          17.186659
## 124 123.1       15.2          13.391058
## 125 229.5       19.7          17.896251
## 128  80.2        8.8          11.079650
## 130  59.6        9.7           9.868661
## 133   8.4        5.7           6.574755
## 134 219.8       19.6          17.558015
## 136  48.3       11.6           9.176526
## 140 184.9       20.7          16.220784
## 143 220.5       20.1          17.582910
## 145  96.2       11.4          11.974972
## 149  38.0       10.9           8.528450
## 153 197.6       16.6          16.729184
## 168 206.8       12.2          17.081905
## 169 215.4       17.1          17.399794
## 171  50.0        8.4           9.281914
## 172 164.5       14.5          15.351962
## 179 276.7       11.8          19.334567
## 188 191.1       17.3          16.472093
## 191  39.5       10.8           8.623850
## 199 283.6       25.5          19.515984

Evaluar predicciones con rmse

rmse.poly2 <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.poly2
## [1] 3.060548

Modelo de regresión polinomial de quinto nivel

Usando argumento Poly

modelo.poly5 <- lm(data = datos.entrenamiento, formula = Sales ~ poly(x = TV, degree = 5, raw = TRUE))

Resumen del modelo

resumen <- summary(modelo.poly5)
resumen
## 
## Call:
## lm(formula = Sales ~ poly(x = TV, degree = 5, raw = TRUE), data = datos.entrenamiento)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.8165 -1.8804 -0.3115  1.8008  7.7906 
## 
## Coefficients:
##                                         Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                            3.225e+00  1.512e+00   2.133  0.03448 * 
## poly(x = TV, degree = 5, raw = TRUE)1  2.931e-01  1.108e-01   2.646  0.00898 **
## poly(x = TV, degree = 5, raw = TRUE)2 -4.528e-03  2.325e-03  -1.948  0.05325 . 
## poly(x = TV, degree = 5, raw = TRUE)3  3.583e-05  1.976e-05   1.814  0.07165 . 
## poly(x = TV, degree = 5, raw = TRUE)4 -1.268e-07  7.298e-08  -1.738  0.08427 . 
## poly(x = TV, degree = 5, raw = TRUE)5  1.638e-10  9.736e-11   1.682  0.09450 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.26 on 156 degrees of freedom
## Multiple R-squared:  0.6229, Adjusted R-squared:  0.6108 
## F-statistic: 51.54 on 5 and 156 DF,  p-value: < 2.2e-16

El valor de los coeficientes con modelo Poly

b0 <- modelo.poly5$coefficients[1]
b1 <- modelo.poly5$coefficients[2]
b2 <- modelo.poly5$coefficients[3]
b3 <- modelo.poly5$coefficients[4]
b4 <- modelo.poly5$coefficients[5]
b5 <- modelo.poly5$coefficients[6]
b0; b1; b2; b3; b4; b5
## (Intercept) 
##    3.224586
## poly(x = TV, degree = 5, raw = TRUE)1 
##                             0.2930818
## poly(x = TV, degree = 5, raw = TRUE)2 
##                          -0.004527839
## poly(x = TV, degree = 5, raw = TRUE)3 
##                          3.583497e-05
## poly(x = TV, degree = 5, raw = TRUE)4 
##                         -1.268014e-07
## poly(x = TV, degree = 5, raw = TRUE)5 
##                          1.637938e-10

Valor de R Square

resumen$r.squared
## [1] 0.6229085

Predicciones del modelo

predicciones <- predict(object = modelo.poly5, newdata = datos.validacion)

Curva de estimación

ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) + 
  geom_point(colour = "blue") + 
  geom_line(aes(x = TV, y = modelo.poly5$fitted.values, colour = 'red'))

Generar tabla comparativa

tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
##        TV Sales.real Sales.predicciones
## 10  199.8       10.6          16.932000
## 33   97.2        9.6          11.944783
## 37  266.9       25.4          18.612474
## 49  227.2       14.8          17.644641
## 66   69.0        9.3          11.044258
## 67   31.5        9.5           8.964205
## 70  216.8       22.3          17.426810
## 76   16.9        8.7           7.047323
## 78  120.5       14.2          12.922519
## 83   75.3       11.3          11.240286
## 93  217.7       19.4          17.447809
## 95  107.4       11.5          12.337137
## 96  163.3       16.9          15.241535
## 101 222.4       11.7          17.550383
## 105 238.2       20.7          17.838357
## 107  25.0        7.2           8.233721
## 109  13.1        5.3           6.363824
## 114 209.6       15.9          17.241007
## 124 123.1       15.2          13.049178
## 125 229.5       19.7          17.686906
## 128  80.2        8.8          11.389508
## 130  59.6        9.7          10.718448
## 133   8.4        5.7           5.387604
## 134 219.8       19.6          17.495055
## 136  48.3       11.6          10.208277
## 140 184.9       20.7          16.333761
## 143 220.5       20.1          17.510282
## 145  96.2       11.4          11.909139
## 149  38.0       10.9           9.538412
## 153 197.6       16.6          16.853300
## 168 206.8       12.2          17.159566
## 169 215.4       17.1          17.393209
## 171  50.0        8.4          10.297128
## 172 164.5       14.5          15.307068
## 179 276.7       11.8          19.194541
## 188 191.1       17.3          16.600937
## 191  39.5       10.8           9.652329
## 199 283.6       25.5          19.790535

Evaluar predicciones con rmse

rmse.poly5 <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.poly5
## [1] 3.051003

Interpretación

modelos <- c("Lineal Simple", "Polinomial Segundo nivel", "Polinomial Quinto nivel")
rmse <- c(c(rmse.lineal, rmse.poly2, rmse.poly5))
comparativo.rmse <- data.frame(modelos, rmse)
kable(x = comparativo.rmse, caption = "Comparativo con rmse", )
Comparativo con rmse
modelos rmse
Lineal Simple 3.042904
Polinomial Segundo nivel 3.060548
Polinomial Quinto nivel 3.051003

Usando el 20% de los datos para el entrenamiento de los modelos, llegué a la conclusión de que el modelo de regresión lineal simple es un poco más efectivo que los demás, seguido del polinomial de quinto nivel y por último el polinomial de segundo nivel.