library(lattice)
library(readr) # Used to import data
library(Metrics) # Used to compute metrics and evaluate models
library(ggplot2) # Used for plots
library(caret) # Used to split the data into training and validation sets
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
library(knitr)
datos <- read.csv("https://raw.githubusercontent.com/rpizarrog/Analisis-Inteligente-de-datos/main/datos/Advertising_Web.csv")
summary(datos)
## X.1 X TV Radio
## Min. : 1.00 Min. : 1.00 Min. : 0.70 Min. : 0.000
## 1st Qu.: 50.75 1st Qu.: 50.75 1st Qu.: 74.38 1st Qu.: 9.975
## Median :100.50 Median :100.50 Median :149.75 Median :22.900
## Mean :100.50 Mean :100.50 Mean :147.04 Mean :23.264
## 3rd Qu.:150.25 3rd Qu.:150.25 3rd Qu.:218.82 3rd Qu.:36.525
## Max. :200.00 Max. :200.00 Max. :296.40 Max. :49.600
## Newspaper Web Sales
## Min. : 0.30 Min. : 4.308 Min. : 1.60
## 1st Qu.: 12.75 1st Qu.: 99.049 1st Qu.:10.38
## Median : 25.75 Median :156.862 Median :12.90
## Mean : 30.55 Mean :159.587 Mean :14.02
## 3rd Qu.: 45.10 3rd Qu.:212.312 3rd Qu.:17.40
## Max. :114.00 Max. :358.247 Max. :27.00
ggplot(datos, aes(x = TV, y = Sales)) +
  geom_point(colour = "blue") +
  geom_smooth(colour = 'orange', method = lm) +
  geom_smooth(colour = 'red')
## `geom_smooth()` using formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
set.seed(2022)
entrena <- createDataPartition(y = datos$Sales, p = 0.80, list = FALSE, times = 1)
# Training data
datos.entrenamiento <- datos[entrena, ] # [rows, columns]
# Validation data
datos.validacion <- datos[-entrena, ]
head(datos.entrenamiento, 10)
## X.1 X TV Radio Newspaper Web Sales
## 1 1 1 230.1 37.8 69.2 306.63475 22.1
## 2 2 2 44.5 39.3 45.1 302.65307 10.4
## 3 3 3 17.2 45.9 69.3 49.49891 9.3
## 4 4 4 151.5 41.3 58.5 257.81689 18.5
## 5 5 5 180.8 10.8 58.4 195.66008 12.9
## 6 6 6 8.7 48.9 75.0 22.07240 7.2
## 7 7 7 57.5 32.8 23.5 246.81160 11.8
## 8 8 8 120.2 19.6 11.6 229.97146 13.2
## 9 9 9 8.6 2.1 1.0 144.61739 4.8
## 11 11 11 66.1 5.8 24.2 45.35903 8.6
tail(datos.entrenamiento, 10)
## X.1 X TV Radio Newspaper Web Sales
## 189 189 189 286.0 13.9 3.7 151.99073 15.9
## 190 190 190 18.7 12.1 23.4 222.90695 6.7
## 192 192 192 75.5 10.8 6.0 301.48119 9.9
## 193 193 193 17.2 4.1 31.6 265.02864 5.9
## 194 194 194 166.8 42.0 3.6 192.24621 19.6
## 195 195 195 149.7 35.6 6.0 99.57998 17.3
## 196 196 196 38.2 3.7 13.8 248.84107 7.6
## 197 197 197 94.2 4.9 8.1 118.04186 9.7
## 198 198 198 177.0 9.3 6.4 213.27467 12.8
## 200 200 200 232.1 8.6 8.7 151.99073 13.4
head(datos.validacion, 10)
## X.1 X TV Radio Newspaper Web Sales
## 10 10 10 199.8 2.6 21.2 111.27226 10.6
## 33 33 33 97.2 1.5 30.0 139.78109 9.6
## 37 37 37 266.9 43.8 5.0 96.31683 25.4
## 49 49 49 227.2 15.8 49.9 75.26918 14.8
## 66 66 66 69.0 9.3 0.9 205.99349 9.3
## 67 67 67 31.5 24.6 2.2 216.47140 9.5
## 70 70 70 216.8 43.9 27.2 149.39610 22.3
## 76 76 76 16.9 43.7 89.4 70.23428 8.7
## 78 78 78 120.5 28.5 14.2 97.45513 14.2
## 83 83 83 75.3 20.3 32.5 231.20983 11.3
tail(datos.validacion, 10)
## X.1 X TV Radio Newspaper Web Sales
## 149 149 149 38.0 40.3 11.9 75.20798 10.9
## 153 153 153 197.6 23.3 14.2 159.52256 16.6
## 168 168 168 206.8 5.2 19.4 115.37196 12.2
## 169 169 169 215.4 23.6 57.6 203.43127 17.1
## 171 171 171 50.0 11.6 18.4 64.01480 8.4
## 172 172 172 164.5 20.9 47.4 96.18039 14.5
## 179 179 179 276.7 2.3 23.7 137.32377 11.8
## 188 188 188 191.1 28.7 18.2 239.27571 17.3
## 191 191 191 39.5 41.1 5.8 219.89058 10.8
## 199 199 199 283.6 42.0 66.2 237.49806 25.5
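As a quick check on the partition, the row counts of both sets can be printed (a minimal sketch; createDataPartition balances the split on Sales, so the training set may hold slightly more than exactly 80% of the 200 rows):
nrow(datos.entrenamiento) # training rows, roughly 160 of 200
nrow(datos.validacion) # validation rows, the remainder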
modelo.simple <- lm(data = datos.entrenamiento, formula = Sales ~ TV)
resumen <- summary(modelo.simple)
resumen
##
## Call:
## lm(formula = Sales ~ TV, data = datos.entrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9573 -2.0042 -0.2716 2.0227 7.2679
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.060804 0.518127 13.63 <2e-16 ***
## TV 0.047116 0.003031 15.54 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.308 on 160 degrees of freedom
## Multiple R-squared: 0.6016, Adjusted R-squared: 0.5992
## F-statistic: 241.7 on 1 and 160 DF, p-value: < 2.2e-16
a <- modelo.simple$coefficients[1]
b <- modelo.simple$coefficients[2]
a; b
## (Intercept)
## 7.060804
## TV
## 0.04711566
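With the intercept and slope extracted, the fitted line can be applied by hand; the TV budget of 100 used below is only an illustrative value, not an observation from the data:
# Manual prediction with the fitted line: Sales ≈ a + b * TV
a + b * 100 # about 11.8 with the coefficients shown above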
resumen$r.squared
## [1] 0.6016483
predicciones <- predict(object = modelo.simple, newdata = datos.validacion)
ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) +
  geom_point(colour = "blue") +
  geom_line(aes(y = modelo.simple$fitted.values), colour = 'red')
tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
## TV Sales.real Sales.predicciones
## 10 199.8 10.6 16.474512
## 33 97.2 9.6 11.640446
## 37 266.9 25.4 19.635973
## 49 227.2 14.8 17.765481
## 66 69.0 9.3 10.311784
## 67 31.5 9.5 8.544947
## 70 216.8 22.3 17.275478
## 76 16.9 8.7 7.857058
## 78 120.5 14.2 12.738240
## 83 75.3 11.3 10.608613
## 93 217.7 19.4 17.317882
## 95 107.4 11.5 12.121025
## 96 163.3 16.9 14.754791
## 101 222.4 11.7 17.539326
## 105 238.2 20.7 18.283753
## 107 25.0 7.2 8.238695
## 109 13.1 5.3 7.678019
## 114 209.6 15.9 16.936245
## 124 123.1 15.2 12.860741
## 125 229.5 19.7 17.873847
## 128 80.2 8.8 10.839479
## 130 59.6 9.7 9.868897
## 133 8.4 5.7 7.456575
## 134 219.8 19.6 17.416825
## 136 48.3 11.6 9.336490
## 140 184.9 20.7 15.772489
## 143 220.5 20.1 17.449806
## 145 96.2 11.4 11.593330
## 149 38.0 10.9 8.851199
## 153 197.6 16.6 16.370858
## 168 206.8 12.2 16.804322
## 169 215.4 17.1 17.209516
## 171 50.0 8.4 9.416587
## 172 164.5 14.5 14.811329
## 179 276.7 11.8 20.097706
## 188 191.1 17.3 16.064606
## 191 39.5 10.8 8.921872
## 199 283.6 25.5 20.422804
rmse.lineal <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.lineal
## [1] 3.042904
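The same figure can be reproduced without the Metrics package; this is simply the RMSE formula written out, which makes the metric explicit:
# RMSE by hand: square root of the mean squared prediction error
sqrt(mean((tabla$Sales.real - tabla$Sales.predicciones)^2))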
modelo.poly2 <- lm(data = datos.entrenamiento, formula = Sales ~ poly(x = TV, degree = 2, raw = TRUE))
modelo.poly2.I <- lm(Sales ~ TV + I(TV^2), data = datos.entrenamiento)
resumen <- summary(modelo.poly2)
resumen
##
## Call:
## lm(formula = Sales ~ poly(x = TV, degree = 2, raw = TRUE), data = datos.entrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.9497 -2.0174 -0.2908 1.9922 7.6601
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.996e+00 7.421e-01 8.079 1.53e-13
## poly(x = TV, degree = 2, raw = TRUE)1 6.959e-02 1.170e-02 5.949 1.66e-08
## poly(x = TV, degree = 2, raw = TRUE)2 -7.727e-05 3.887e-05 -1.988 0.0485
##
## (Intercept) ***
## poly(x = TV, degree = 2, raw = TRUE)1 ***
## poly(x = TV, degree = 2, raw = TRUE)2 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.278 on 159 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.6064
## F-statistic: 125 on 2 and 159 DF, p-value: < 2.2e-16
b0 <- modelo.poly2$coefficients[1]
b1 <- modelo.poly2$coefficients[2]
b2 <- modelo.poly2$coefficients[3]
b0; b1; b2
## (Intercept)
## 5.995662
## poly(x = TV, degree = 2, raw = TRUE)1
## 0.06958871
## poly(x = TV, degree = 2, raw = TRUE)2
## -7.727362e-05
# Coefficients from the I() parameterization (identical to the raw poly() fit)
bI0 <- modelo.poly2.I$coefficients[1]
bI1 <- modelo.poly2.I$coefficients[2]
bI2 <- modelo.poly2.I$coefficients[3]
bI0; bI1; bI2
## (Intercept)
## 5.995662
## TV
## 0.06958871
## I(TV^2)
## -7.727362e-05
resumen$r.squared
## [1] 0.6113084
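Plugging the extracted coefficients into the quadratic equation makes the effect of the small negative second-degree term visible; the TV budget of 150 used here is only an illustrative value:
# Manual prediction with the quadratic fit: Sales ≈ b0 + b1*TV + b2*TV^2
b0 + b1 * 150 + b2 * 150^2 # about 14.7; the negative b2 bends the curve slightly downward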
predicciones <- predict(object = modelo.poly2, newdata = datos.validacion)
ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) +
  geom_point(colour = "blue") +
  geom_line(aes(y = modelo.poly2$fitted.values), colour = 'red')
tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
## TV Sales.real Sales.predicciones
## 10 199.8 10.6 16.814721
## 33 97.2 9.6 12.029616
## 37 266.9 25.4 19.064256
## 49 227.2 14.8 17.817365
## 66 69.0 9.3 10.429383
## 67 31.5 9.5 8.111032
## 70 216.8 22.3 17.450462
## 76 16.9 8.7 7.149641
## 78 120.5 14.2 13.259070
## 83 75.3 11.3 10.797544
## 93 217.7 19.4 17.482874
## 95 107.4 11.5 12.578157
## 96 163.3 16.9 15.298852
## 101 222.4 11.7 17.650102
## 105 238.2 20.7 18.187247
## 107 25.0 7.2 7.687084
## 109 13.1 5.3 6.894013
## 114 209.6 15.9 17.186659
## 124 123.1 15.2 13.391058
## 125 229.5 19.7 17.896251
## 128 80.2 8.8 11.079650
## 130 59.6 9.7 9.868661
## 133 8.4 5.7 6.574755
## 134 219.8 19.6 17.558015
## 136 48.3 11.6 9.176526
## 140 184.9 20.7 16.220784
## 143 220.5 20.1 17.582910
## 145 96.2 11.4 11.974972
## 149 38.0 10.9 8.528450
## 153 197.6 16.6 16.729184
## 168 206.8 12.2 17.081905
## 169 215.4 17.1 17.399794
## 171 50.0 8.4 9.281914
## 172 164.5 14.5 15.351962
## 179 276.7 11.8 19.334567
## 188 191.1 17.3 16.472093
## 191 39.5 10.8 8.623850
## 199 283.6 25.5 19.515984
rmse.poly2 <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.poly2
## [1] 3.060548
modelo.poly5 <- lm(data = datos.entrenamiento, formula = Sales ~ poly(x = TV, degree = 5, raw = TRUE))
resumen <- summary(modelo.poly5)
resumen
##
## Call:
## lm(formula = Sales ~ poly(x = TV, degree = 5, raw = TRUE), data = datos.entrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8165 -1.8804 -0.3115 1.8008 7.7906
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.225e+00 1.512e+00 2.133 0.03448 *
## poly(x = TV, degree = 5, raw = TRUE)1 2.931e-01 1.108e-01 2.646 0.00898 **
## poly(x = TV, degree = 5, raw = TRUE)2 -4.528e-03 2.325e-03 -1.948 0.05325 .
## poly(x = TV, degree = 5, raw = TRUE)3 3.583e-05 1.976e-05 1.814 0.07165 .
## poly(x = TV, degree = 5, raw = TRUE)4 -1.268e-07 7.298e-08 -1.738 0.08427 .
## poly(x = TV, degree = 5, raw = TRUE)5 1.638e-10 9.736e-11 1.682 0.09450 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.26 on 156 degrees of freedom
## Multiple R-squared: 0.6229, Adjusted R-squared: 0.6108
## F-statistic: 51.54 on 5 and 156 DF, p-value: < 2.2e-16
b0 <- modelo.poly5$coefficients[1]
b1 <- modelo.poly5$coefficients[2]
b2 <- modelo.poly5$coefficients[3]
b3 <- modelo.poly5$coefficients[4]
b4 <- modelo.poly5$coefficients[5]
b5 <- modelo.poly5$coefficients[6]
b0; b1; b2; b3; b4; b5
## (Intercept)
## 3.224586
## poly(x = TV, degree = 5, raw = TRUE)1
## 0.2930818
## poly(x = TV, degree = 5, raw = TRUE)2
## -0.004527839
## poly(x = TV, degree = 5, raw = TRUE)3
## 3.583497e-05
## poly(x = TV, degree = 5, raw = TRUE)4
## -1.268014e-07
## poly(x = TV, degree = 5, raw = TRUE)5
## 1.637938e-10
resumen$r.squared
## [1] 0.6229085
predicciones <- predict(object = modelo.poly5, newdata = datos.validacion)
ggplot(data = datos.entrenamiento, aes(x = TV, y = Sales)) +
  geom_point(colour = "blue") +
  geom_line(aes(y = modelo.poly5$fitted.values), colour = 'red')
tabla <- data.frame(TV = datos.validacion$TV, Sales.real = datos.validacion$Sales, Sales.predicciones = predicciones)
tabla
## TV Sales.real Sales.predicciones
## 10 199.8 10.6 16.932000
## 33 97.2 9.6 11.944783
## 37 266.9 25.4 18.612474
## 49 227.2 14.8 17.644641
## 66 69.0 9.3 11.044258
## 67 31.5 9.5 8.964205
## 70 216.8 22.3 17.426810
## 76 16.9 8.7 7.047323
## 78 120.5 14.2 12.922519
## 83 75.3 11.3 11.240286
## 93 217.7 19.4 17.447809
## 95 107.4 11.5 12.337137
## 96 163.3 16.9 15.241535
## 101 222.4 11.7 17.550383
## 105 238.2 20.7 17.838357
## 107 25.0 7.2 8.233721
## 109 13.1 5.3 6.363824
## 114 209.6 15.9 17.241007
## 124 123.1 15.2 13.049178
## 125 229.5 19.7 17.686906
## 128 80.2 8.8 11.389508
## 130 59.6 9.7 10.718448
## 133 8.4 5.7 5.387604
## 134 219.8 19.6 17.495055
## 136 48.3 11.6 10.208277
## 140 184.9 20.7 16.333761
## 143 220.5 20.1 17.510282
## 145 96.2 11.4 11.909139
## 149 38.0 10.9 9.538412
## 153 197.6 16.6 16.853300
## 168 206.8 12.2 17.159566
## 169 215.4 17.1 17.393209
## 171 50.0 8.4 10.297128
## 172 164.5 14.5 15.307068
## 179 276.7 11.8 19.194541
## 188 191.1 17.3 16.600937
## 191 39.5 10.8 9.652329
## 199 283.6 25.5 19.790535
rmse.poly5 <- rmse(actual = tabla$Sales.real, predicted = tabla$Sales.predicciones)
rmse.poly5
## [1] 3.051003
modelos <- c("Simple linear", "Second-degree polynomial", "Fifth-degree polynomial")
rmse <- c(rmse.lineal, rmse.poly2, rmse.poly5)
comparativo.rmse <- data.frame(modelos, rmse)
kable(x = comparativo.rmse, caption = "RMSE comparison")
| modelos | rmse |
|---|---|
| Simple linear | 3.042904 |
| Second-degree polynomial | 3.060548 |
| Fifth-degree polynomial | 3.051003 |
Training the models on 80% of the data and evaluating them on the remaining 20%, I conclude that the simple linear regression model is slightly more effective (lowest validation RMSE) than the others, followed by the fifth-degree polynomial and, last, the second-degree polynomial.
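As a quick programmatic check of that conclusion, the comparison table built above can be queried for the lowest RMSE (a minimal sketch):
# Row of the comparison table with the smallest validation RMSE
comparativo.rmse[which.min(comparativo.rmse$rmse), ]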