library(readr)
# install.packages("corrplot") # Nuevo
library(corrplot) # Para correlación
library(caret) # Para dividir conjunto de datos
# install.packages("MASS") # NUEVO
library(MASS)
#datos <- read.csv("~/Mis clases ITD/Semestre Enero Junio 2020/Analisis Inteligente de Datos/datos/auto-mpg.csv")
datos <- read.csv("https://raw.githubusercontent.com/rpizarrog/Curso-Titulacion-Data-Science-/master/2020/datos/auto-mpg.csv")
head(datos) # Los primeros seis
## No mpg cylinders displacement horsepower weight acceleration model_year
## 1 1 28 4 140 90 2264 15.5 71
## 2 2 19 3 70 97 2330 13.5 72
## 3 3 36 4 107 75 2205 14.5 82
## 4 4 28 4 97 92 2288 17.0 72
## 5 5 21 6 199 90 2648 15.0 70
## 6 6 23 4 115 95 2694 15.0 75
## car_name
## 1 chevrolet vega 2300
## 2 mazda rx2 coupe
## 3 honda accord
## 4 datsun 510 (sw)
## 5 amc gremlin
## 6 audi 100ls
tail(datos) # Los últimos seis
## No mpg cylinders displacement horsepower weight acceleration model_year
## 393 393 13.0 8 350 155 4502 13.5 72
## 394 394 16.5 6 168 120 3820 16.7 76
## 395 395 34.5 4 105 70 2150 14.9 79
## 396 396 38.1 4 89 60 1968 18.8 80
## 397 397 30.5 4 98 63 2051 17.0 77
## 398 398 19.0 6 232 100 2634 13.0 71
## car_name
## 393 buick lesabre custom
## 394 mercedes-benz 280s
## 395 plymouth horizon tc3
## 396 toyota corolla tercel
## 397 chevrolet chevette
## 398 amc gremlin
summary(datos) # Antes de categorizar o factor
## No mpg cylinders displacement
## Min. : 1.0 Min. : 9.00 Min. :3.000 Min. : 68.0
## 1st Qu.:100.2 1st Qu.:17.50 1st Qu.:4.000 1st Qu.:104.2
## Median :199.5 Median :23.00 Median :4.000 Median :148.5
## Mean :199.5 Mean :23.51 Mean :5.455 Mean :193.4
## 3rd Qu.:298.8 3rd Qu.:29.00 3rd Qu.:8.000 3rd Qu.:262.0
## Max. :398.0 Max. :46.60 Max. :8.000 Max. :455.0
##
## horsepower weight acceleration model_year
## Min. : 46.0 Min. :1613 Min. : 8.00 Min. :70.00
## 1st Qu.: 76.0 1st Qu.:2224 1st Qu.:13.82 1st Qu.:73.00
## Median : 92.0 Median :2804 Median :15.50 Median :76.00
## Mean :104.1 Mean :2970 Mean :15.57 Mean :76.01
## 3rd Qu.:125.0 3rd Qu.:3608 3rd Qu.:17.18 3rd Qu.:79.00
## Max. :230.0 Max. :5140 Max. :24.80 Max. :82.00
##
## car_name
## ford pinto : 6
## amc matador : 5
## ford maverick : 5
## toyota corolla: 5
## amc gremlin : 4
## amc hornet : 4
## (Other) :369
datos$cylinders <- factor(datos$cylinders, levels = c(3,4,5,6,8),labels = c('3c', '4c','5c','6c','8c'))
summary(datos) # Después de categorizar o factor
## No mpg cylinders displacement horsepower
## Min. : 1.0 Min. : 9.00 3c: 4 Min. : 68.0 Min. : 46.0
## 1st Qu.:100.2 1st Qu.:17.50 4c:204 1st Qu.:104.2 1st Qu.: 76.0
## Median :199.5 Median :23.00 5c: 3 Median :148.5 Median : 92.0
## Mean :199.5 Mean :23.51 6c: 84 Mean :193.4 Mean :104.1
## 3rd Qu.:298.8 3rd Qu.:29.00 8c:103 3rd Qu.:262.0 3rd Qu.:125.0
## Max. :398.0 Max. :46.60 Max. :455.0 Max. :230.0
##
## weight acceleration model_year car_name
## Min. :1613 Min. : 8.00 Min. :70.00 ford pinto : 6
## 1st Qu.:2224 1st Qu.:13.82 1st Qu.:73.00 amc matador : 5
## Median :2804 Median :15.50 Median :76.00 ford maverick : 5
## Mean :2970 Mean :15.57 Mean :76.01 toyota corolla: 5
## 3rd Qu.:3608 3rd Qu.:17.18 3rd Qu.:79.00 amc gremlin : 4
## Max. :5140 Max. :24.80 Max. :82.00 amc hornet : 4
## (Other) :369
cor(x=datos[,-c(1,3, 8,9)], method = "pearson")
## mpg displacement horsepower weight acceleration
## mpg 1.0000000 -0.8042028 -0.7756163 -0.8317409 0.4202889
## displacement -0.8042028 1.0000000 0.8975294 0.9328241 -0.5436841
## horsepower -0.7756163 0.8975294 1.0000000 0.8636062 -0.6880550
## weight -0.8317409 0.9328241 0.8636062 1.0000000 -0.4174573
## acceleration 0.4202889 -0.5436841 -0.6880550 -0.4174573 1.0000000
pairs(x=datos[,-c(1,3,8,9)], lower.panel = NULL)
corrplot(corr = cor(x=datos[,-c(1,3,8,9)], method = "pearson"), method = "number")
set.seed(2018)
entrena <- createDataPartition(datos$mpg, p=0.7, list = FALSE)
# entrena
nrow(entrena) # Cuantos datos de entrenamiento
## [1] 280
datosentrenamiento <- datos[entrena, -c(1,8,9)]
datosvalidacion <- datos[-entrena, -c(1,8,9)]
# Veremos que no son los mis datos los de entrenamiento y los datos de validación
head(datos)
## No mpg cylinders displacement horsepower weight acceleration model_year
## 1 1 28 4c 140 90 2264 15.5 71
## 2 2 19 3c 70 97 2330 13.5 72
## 3 3 36 4c 107 75 2205 14.5 82
## 4 4 28 4c 97 92 2288 17.0 72
## 5 5 21 6c 199 90 2648 15.0 70
## 6 6 23 4c 115 95 2694 15.0 75
## car_name
## 1 chevrolet vega 2300
## 2 mazda rx2 coupe
## 3 honda accord
## 4 datsun 510 (sw)
## 5 amc gremlin
## 6 audi 100ls
head(datosentrenamiento)
## mpg cylinders displacement horsepower weight acceleration
## 3 36.0 4c 107 75 2205 14.5
## 4 28.0 4c 97 92 2288 17.0
## 5 21.0 6c 199 90 2648 15.0
## 6 23.0 4c 115 95 2694 15.0
## 7 15.5 8c 304 120 3962 13.9
## 8 32.9 4c 119 100 2615 14.8
head(datosvalidacion)
## mpg cylinders displacement horsepower weight acceleration
## 1 28.0 4c 140 90 2264 15.5
## 2 19.0 3c 70 97 2330 13.5
## 11 12.0 8c 429 198 4952 11.5
## 13 13.0 8c 302 130 3870 15.0
## 14 27.9 4c 156 105 2800 14.4
## 18 14.0 8c 400 175 4385 12.0
modelo <- lm(mpg ~ ., data = datosentrenamiento)
modelo
##
## Call:
## lm(formula = mpg ~ ., data = datosentrenamiento)
##
## Coefficients:
## (Intercept) cylinders4c cylinders5c cylinders6c cylinders8c
## 37.284202 6.231475 8.248195 2.131026 4.568171
## displacement horsepower weight acceleration
## 0.002245 -0.057543 -0.004665 0.050745
summary(modelo)
##
## Call:
## lm(formula = mpg ~ ., data = datosentrenamiento)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.0606 -2.4686 -0.4435 1.9821 16.0907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.2842024 3.6497412 10.216 < 2e-16 ***
## cylinders4c 6.2314753 2.4926855 2.500 0.01301 *
## cylinders5c 8.2481946 3.8091396 2.165 0.03123 *
## cylinders6c 2.1310256 2.7759570 0.768 0.44335
## cylinders8c 4.5681710 3.2054454 1.425 0.15527
## displacement 0.0022449 0.0108924 0.206 0.83687
## horsepower -0.0575428 0.0202773 -2.838 0.00489 **
## weight -0.0046652 0.0009999 -4.665 4.84e-06 ***
## acceleration 0.0507454 0.1443575 0.352 0.72547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.092 on 271 degrees of freedom
## Multiple R-squared: 0.7304, Adjusted R-squared: 0.7224
## F-statistic: 91.75 on 8 and 271 DF, p-value: < 2.2e-16
boxplot(modelo$residuals)
sqrt(mean((modelo$fitted.values - datosentrenamiento$mpg) ^ 2))
## [1] 4.026021
mpg_prediccion <- predict(modelo, datosvalidacion)
sqrt(mean((mpg_prediccion - datosvalidacion$mpg) ^ 2))
## [1] 3.894627
cyl = c(4,5)
dis = c(80,100)
hp = c(70,90)
wei = c(2000, 2100)
ace = c(14, 16)
nuevosdatos <- data.frame(cylinders=cyl, displacement=dis,horsepower=hp, weight=wei, acceleration = ace)
nuevosdatos
## cylinders displacement horsepower weight acceleration
## 1 4 80 70 2000 14
## 2 5 100 90 2100 16
# Directo
38.607311983 + 7.212652193 + 0.006877506 * 80 + -0.072208661 *70 + -0.005155968 * 2000 + 0.024851517 * 14
## [1] 31.35154
# PREdecir para cuatro cylndros
modelo$coefficients
## (Intercept) cylinders4c cylinders5c cylinders6c cylinders8c displacement
## 37.284202446 6.231475322 8.248194569 2.131025643 4.568170956 0.002244864
## horsepower weight acceleration
## -0.057542812 -0.004665209 0.050745431
mpg.predict4 = modelo$coefficients[1] +
modelo$coefficients[2] * 1 +
modelo$coefficients[6] * dis[1] +
modelo$coefficients[7] * hp[1] +
modelo$coefficients[8] * wei[1] +
modelo$coefficients[9] * ace[1]
mpg.predict4
## (Intercept)
## 31.04729
# Predecir para cinco cilindros
modelo$coefficients
## (Intercept) cylinders4c cylinders5c cylinders6c cylinders8c displacement
## 37.284202446 6.231475322 8.248194569 2.131025643 4.568170956 0.002244864
## horsepower weight acceleration
## -0.057542812 -0.004665209 0.050745431
mpg.predict5 = modelo$coefficients[1] +
modelo$coefficients[3] * 1 +
modelo$coefficients[6] * dis[2] +
modelo$coefficients[7] * hp[2] +
modelo$coefficients[8] * wei[2] +
modelo$coefficients[9] * ace[2]
mpg.predict5
## (Intercept)
## 31.59302
# Pendiente, algo me falta.....
nuevosdatos$cylinders <- factor(nuevosdatos$cylinders, levels = c(4,5),labels = c('4c','5c'))
mpg.predict <- predict(modelo, nuevosdatos)
mpg.predict
## 1 2
## 31.04729 31.59302
par(mfrow=c(2,2))
plot(modelo)
* Interpretación.