Regresión lineal mútiple de autos

Las librerías

library(readr)
# install.packages("corrplot") # Nuevo
library(corrplot) # Para correlación
library(caret)  # Para dividir conjunto de datos

# install.packages("MASS") # NUEVO
library(MASS)

Los datos

#datos <- read.csv("~/Mis clases ITD/Semestre Enero Junio 2020/Analisis Inteligente de Datos/datos/auto-mpg.csv")
datos <- read.csv("https://raw.githubusercontent.com/rpizarrog/Curso-Titulacion-Data-Science-/master/2020/datos/auto-mpg.csv")
head(datos) # Los primeros seis
##   No mpg cylinders displacement horsepower weight acceleration model_year
## 1  1  28         4          140         90   2264         15.5         71
## 2  2  19         3           70         97   2330         13.5         72
## 3  3  36         4          107         75   2205         14.5         82
## 4  4  28         4           97         92   2288         17.0         72
## 5  5  21         6          199         90   2648         15.0         70
## 6  6  23         4          115         95   2694         15.0         75
##              car_name
## 1 chevrolet vega 2300
## 2     mazda rx2 coupe
## 3        honda accord
## 4     datsun 510 (sw)
## 5         amc gremlin
## 6          audi 100ls
tail(datos) # Los últimos seis
##      No  mpg cylinders displacement horsepower weight acceleration model_year
## 393 393 13.0         8          350        155   4502         13.5         72
## 394 394 16.5         6          168        120   3820         16.7         76
## 395 395 34.5         4          105         70   2150         14.9         79
## 396 396 38.1         4           89         60   1968         18.8         80
## 397 397 30.5         4           98         63   2051         17.0         77
## 398 398 19.0         6          232        100   2634         13.0         71
##                  car_name
## 393  buick lesabre custom
## 394    mercedes-benz 280s
## 395  plymouth horizon tc3
## 396 toyota corolla tercel
## 397    chevrolet chevette
## 398           amc gremlin
summary(datos) # Antes de categorizar o factor
##        No             mpg          cylinders      displacement  
##  Min.   :  1.0   Min.   : 9.00   Min.   :3.000   Min.   : 68.0  
##  1st Qu.:100.2   1st Qu.:17.50   1st Qu.:4.000   1st Qu.:104.2  
##  Median :199.5   Median :23.00   Median :4.000   Median :148.5  
##  Mean   :199.5   Mean   :23.51   Mean   :5.455   Mean   :193.4  
##  3rd Qu.:298.8   3rd Qu.:29.00   3rd Qu.:8.000   3rd Qu.:262.0  
##  Max.   :398.0   Max.   :46.60   Max.   :8.000   Max.   :455.0  
##                                                                 
##    horsepower        weight      acceleration     model_year   
##  Min.   : 46.0   Min.   :1613   Min.   : 8.00   Min.   :70.00  
##  1st Qu.: 76.0   1st Qu.:2224   1st Qu.:13.82   1st Qu.:73.00  
##  Median : 92.0   Median :2804   Median :15.50   Median :76.00  
##  Mean   :104.1   Mean   :2970   Mean   :15.57   Mean   :76.01  
##  3rd Qu.:125.0   3rd Qu.:3608   3rd Qu.:17.18   3rd Qu.:79.00  
##  Max.   :230.0   Max.   :5140   Max.   :24.80   Max.   :82.00  
##                                                                
##            car_name  
##  ford pinto    :  6  
##  amc matador   :  5  
##  ford maverick :  5  
##  toyota corolla:  5  
##  amc gremlin   :  4  
##  amc hornet    :  4  
##  (Other)       :369

Convertir la variable categórica de Cilindros a factor

datos$cylinders <- factor(datos$cylinders, levels = c(3,4,5,6,8),labels = c('3c', '4c','5c','6c','8c'))

summary(datos) # Después de categorizar o factor
##        No             mpg        cylinders  displacement     horsepower   
##  Min.   :  1.0   Min.   : 9.00   3c:  4    Min.   : 68.0   Min.   : 46.0  
##  1st Qu.:100.2   1st Qu.:17.50   4c:204    1st Qu.:104.2   1st Qu.: 76.0  
##  Median :199.5   Median :23.00   5c:  3    Median :148.5   Median : 92.0  
##  Mean   :199.5   Mean   :23.51   6c: 84    Mean   :193.4   Mean   :104.1  
##  3rd Qu.:298.8   3rd Qu.:29.00   8c:103    3rd Qu.:262.0   3rd Qu.:125.0  
##  Max.   :398.0   Max.   :46.60             Max.   :455.0   Max.   :230.0  
##                                                                           
##      weight      acceleration     model_year              car_name  
##  Min.   :1613   Min.   : 8.00   Min.   :70.00   ford pinto    :  6  
##  1st Qu.:2224   1st Qu.:13.82   1st Qu.:73.00   amc matador   :  5  
##  Median :2804   Median :15.50   Median :76.00   ford maverick :  5  
##  Mean   :2970   Mean   :15.57   Mean   :76.01   toyota corolla:  5  
##  3rd Qu.:3608   3rd Qu.:17.18   3rd Qu.:79.00   amc gremlin   :  4  
##  Max.   :5140   Max.   :24.80   Max.   :82.00   amc hornet    :  4  
##                                                 (Other)       :369

Tabla de correlación

cor(x=datos[,-c(1,3, 8,9)], method = "pearson")
##                     mpg displacement horsepower     weight acceleration
## mpg           1.0000000   -0.8042028 -0.7756163 -0.8317409    0.4202889
## displacement -0.8042028    1.0000000  0.8975294  0.9328241   -0.5436841
## horsepower   -0.7756163    0.8975294  1.0000000  0.8636062   -0.6880550
## weight       -0.8317409    0.9328241  0.8636062  1.0000000   -0.4174573
## acceleration  0.4202889   -0.5436841 -0.6880550 -0.4174573    1.0000000

Mostrando correlaciones con pairs()

pairs(x=datos[,-c(1,3,8,9)], lower.panel = NULL)

Correlación con corrplot()

corrplot(corr = cor(x=datos[,-c(1,3,8,9)], method = "pearson"), method = "number")

Dividir conjunto de entrenamiento y datos de validación o prueba

set.seed(2018)
entrena <- createDataPartition(datos$mpg, p=0.7, list = FALSE)
# entrena

nrow(entrena) # Cuantos datos de entrenamiento
## [1] 280

Generando el modelo lineal con los datos de entrenamiento

datosentrenamiento <- datos[entrena, -c(1,8,9)]

datosvalidacion <- datos[-entrena, -c(1,8,9)]

# Veremos que no son los mis datos los de entrenamiento y los datos de validación
head(datos)
##   No mpg cylinders displacement horsepower weight acceleration model_year
## 1  1  28        4c          140         90   2264         15.5         71
## 2  2  19        3c           70         97   2330         13.5         72
## 3  3  36        4c          107         75   2205         14.5         82
## 4  4  28        4c           97         92   2288         17.0         72
## 5  5  21        6c          199         90   2648         15.0         70
## 6  6  23        4c          115         95   2694         15.0         75
##              car_name
## 1 chevrolet vega 2300
## 2     mazda rx2 coupe
## 3        honda accord
## 4     datsun 510 (sw)
## 5         amc gremlin
## 6          audi 100ls
head(datosentrenamiento)
##    mpg cylinders displacement horsepower weight acceleration
## 3 36.0        4c          107         75   2205         14.5
## 4 28.0        4c           97         92   2288         17.0
## 5 21.0        6c          199         90   2648         15.0
## 6 23.0        4c          115         95   2694         15.0
## 7 15.5        8c          304        120   3962         13.9
## 8 32.9        4c          119        100   2615         14.8
head(datosvalidacion)
##     mpg cylinders displacement horsepower weight acceleration
## 1  28.0        4c          140         90   2264         15.5
## 2  19.0        3c           70         97   2330         13.5
## 11 12.0        8c          429        198   4952         11.5
## 13 13.0        8c          302        130   3870         15.0
## 14 27.9        4c          156        105   2800         14.4
## 18 14.0        8c          400        175   4385         12.0
modelo <- lm(mpg ~ ., data = datosentrenamiento)

modelo
## 
## Call:
## lm(formula = mpg ~ ., data = datosentrenamiento)
## 
## Coefficients:
##  (Intercept)   cylinders4c   cylinders5c   cylinders6c   cylinders8c  
##    37.284202      6.231475      8.248195      2.131026      4.568171  
## displacement    horsepower        weight  acceleration  
##     0.002245     -0.057543     -0.004665      0.050745

Summary(modelo)

summary(modelo)
## 
## Call:
## lm(formula = mpg ~ ., data = datosentrenamiento)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.0606  -2.4686  -0.4435   1.9821  16.0907 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  37.2842024  3.6497412  10.216  < 2e-16 ***
## cylinders4c   6.2314753  2.4926855   2.500  0.01301 *  
## cylinders5c   8.2481946  3.8091396   2.165  0.03123 *  
## cylinders6c   2.1310256  2.7759570   0.768  0.44335    
## cylinders8c   4.5681710  3.2054454   1.425  0.15527    
## displacement  0.0022449  0.0108924   0.206  0.83687    
## horsepower   -0.0575428  0.0202773  -2.838  0.00489 ** 
## weight       -0.0046652  0.0009999  -4.665 4.84e-06 ***
## acceleration  0.0507454  0.1443575   0.352  0.72547    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.092 on 271 degrees of freedom
## Multiple R-squared:  0.7304, Adjusted R-squared:  0.7224 
## F-statistic: 91.75 on 8 and 271 DF,  p-value: < 2.2e-16
boxplot(modelo$residuals)

Error cuadrático medio del modelo en los datos de entrenamiento

sqrt(mean((modelo$fitted.values - datosentrenamiento$mpg) ^ 2))
## [1] 4.026021

Evaluar con los datos de validación que no serán los de entrenamiento

mpg_prediccion <- predict(modelo, datosvalidacion) 

Error cuadrático medio del modelo en los datos de validación

sqrt(mean((mpg_prediccion - datosvalidacion$mpg) ^ 2))
## [1] 3.894627

PRedecir dos nuevos autos con ciertas características

cyl = c(4,5)
dis = c(80,100)
hp = c(70,90)
wei = c(2000, 2100)
ace = c(14, 16)

nuevosdatos <- data.frame(cylinders=cyl, displacement=dis,horsepower=hp, weight=wei, acceleration = ace)
nuevosdatos
##   cylinders displacement horsepower weight acceleration
## 1         4           80         70   2000           14
## 2         5          100         90   2100           16

Predecir por la Fórmula

# Directo
38.607311983 + 7.212652193 + 0.006877506 * 80 + -0.072208661 *70 + -0.005155968 * 2000 + 0.024851517 * 14
## [1] 31.35154
# PREdecir para cuatro cylndros
modelo$coefficients
##  (Intercept)  cylinders4c  cylinders5c  cylinders6c  cylinders8c displacement 
## 37.284202446  6.231475322  8.248194569  2.131025643  4.568170956  0.002244864 
##   horsepower       weight acceleration 
## -0.057542812 -0.004665209  0.050745431
mpg.predict4 = modelo$coefficients[1] + 
              modelo$coefficients[2] * 1 +
              modelo$coefficients[6] * dis[1] + 
              modelo$coefficients[7] * hp[1] + 
              modelo$coefficients[8] * wei[1] + 
              modelo$coefficients[9] * ace[1]
mpg.predict4
## (Intercept) 
##    31.04729
# Predecir para cinco cilindros
modelo$coefficients
##  (Intercept)  cylinders4c  cylinders5c  cylinders6c  cylinders8c displacement 
## 37.284202446  6.231475322  8.248194569  2.131025643  4.568170956  0.002244864 
##   horsepower       weight acceleration 
## -0.057542812 -0.004665209  0.050745431
mpg.predict5 = modelo$coefficients[1] + 
              modelo$coefficients[3] * 1 +
              modelo$coefficients[6] * dis[2] + 
              modelo$coefficients[7] * hp[2] + 
              modelo$coefficients[8] * wei[2] + 
              modelo$coefficients[9] * ace[2]
mpg.predict5
## (Intercept) 
##    31.59302

Predecir por la función predict

# Pendiente, algo me falta..... 
nuevosdatos$cylinders <- factor(nuevosdatos$cylinders, levels = c(4,5),labels = c('4c','5c'))
mpg.predict <- predict(modelo, nuevosdatos)
mpg.predict
##        1        2 
## 31.04729 31.59302

Gráficas varias

par(mfrow=c(2,2))
plot(modelo)

* Interpretación.