rm(list=ls())
library(data.table)
library(ggplot2)
library(leaflet)
library(data.table)
library(gganimate)
library(ggcorrplot)
library(plotly)
library(janitor)
library(RColorBrewer)
library(tidyverse)
library(caret)
library(jtools)
library(scales)
library(lubridate)
library(corrgram)
library(corrplot)
library(knitr)
library(moderndive)
path <- 'C:/Users/Rorro Bielenberg/Desktop/UAI/Data Science/Tarea 5/'
carprice<- fread(paste0(path,'CarPrice_Assignment.csv'))
##Pregunta 1: Muestre un histograma de distribución del precio.(4 ptos)
ggplot(carprice,aes(price)) +
geom_histogram()
##Pregunta 2 Realice un gráfico de correlación entre las variables númericas, estas son, “wheelbase”, “carlength” “carwidth”,“carheight”, “curbweight”, “enginesize”,“boreratio”,“stroke”, “compressionratio”, “horsepower”, “peakrpm”,“citympg”, “highwaympg”,“price” (4 ptos)
variables <- carprice[,.(wheelbase, carlength, carwidth,carheight, curbweight, enginesize,boreratio,stroke, compressionratio, horsepower, peakrpm,citympg, highwaympg,price)]
corr <- cor(variables)
corr
## wheelbase carlength carwidth carheight curbweight
## wheelbase 1.0000000 0.8745875 0.7951436 0.58943476 0.7763863
## carlength 0.8745875 1.0000000 0.8411183 0.49102946 0.8777285
## carwidth 0.7951436 0.8411183 1.0000000 0.27921032 0.8670325
## carheight 0.5894348 0.4910295 0.2792103 1.00000000 0.2955717
## curbweight 0.7763863 0.8777285 0.8670325 0.29557173 1.0000000
## enginesize 0.5693287 0.6833599 0.7354334 0.06714874 0.8505941
## boreratio 0.4887499 0.6064544 0.5591499 0.17107092 0.6484797
## stroke 0.1609590 0.1295326 0.1829417 -0.05530667 0.1687900
## compressionratio 0.2497858 0.1584137 0.1811286 0.26121423 0.1513617
## horsepower 0.3532945 0.5526230 0.6407321 -0.10880206 0.7507393
## peakrpm -0.3604687 -0.2872422 -0.2200123 -0.32041072 -0.2662432
## citympg -0.4704136 -0.6709087 -0.6427043 -0.04863963 -0.7574138
## highwaympg -0.5440819 -0.7046616 -0.6772179 -0.10735763 -0.7974648
## price 0.5778156 0.6829200 0.7593253 0.11933623 0.8353049
## enginesize boreratio stroke compressionratio
## wheelbase 0.56932868 0.488749875 0.16095905 0.249785845
## carlength 0.68335987 0.606454358 0.12953261 0.158413706
## carwidth 0.73543340 0.559149909 0.18294169 0.181128627
## carheight 0.06714874 0.171070922 -0.05530667 0.261214226
## curbweight 0.85059407 0.648479749 0.16879004 0.151361740
## enginesize 1.00000000 0.583774327 0.20312859 0.028971360
## boreratio 0.58377433 1.000000000 -0.05590898 0.005197339
## stroke 0.20312859 -0.055908983 1.00000000 0.186110110
## compressionratio 0.02897136 0.005197339 0.18611011 1.000000000
## horsepower 0.80976865 0.573676823 0.08093954 -0.204326226
## peakrpm -0.24465983 -0.254975528 -0.06796375 -0.435740514
## citympg -0.65365792 -0.584531716 -0.04214475 0.324701425
## highwaympg -0.67746991 -0.587011784 -0.04393093 0.265201389
## price 0.87414480 0.553173237 0.07944308 0.067983506
## horsepower peakrpm citympg highwaympg price
## wheelbase 0.35329448 -0.36046875 -0.47041361 -0.54408192 0.57781560
## carlength 0.55262297 -0.28724220 -0.67090866 -0.70466160 0.68292002
## carwidth 0.64073208 -0.22001230 -0.64270434 -0.67721792 0.75932530
## carheight -0.10880206 -0.32041072 -0.04863963 -0.10735763 0.11933623
## curbweight 0.75073925 -0.26624318 -0.75741378 -0.79746479 0.83530488
## enginesize 0.80976865 -0.24465983 -0.65365792 -0.67746991 0.87414480
## boreratio 0.57367682 -0.25497553 -0.58453172 -0.58701178 0.55317324
## stroke 0.08093954 -0.06796375 -0.04214475 -0.04393093 0.07944308
## compressionratio -0.20432623 -0.43574051 0.32470142 0.26520139 0.06798351
## horsepower 1.00000000 0.13107251 -0.80145618 -0.77054389 0.80813882
## peakrpm 0.13107251 1.00000000 -0.11354438 -0.05427481 -0.08526715
## citympg -0.80145618 -0.11354438 1.00000000 0.97133704 -0.68575134
## highwaympg -0.77054389 -0.05427481 0.97133704 1.00000000 -0.69759909
## price 0.80813882 -0.08526715 -0.68575134 -0.69759909 1.00000000
ggcorrplot(corr,type = 'lower', lab = TRUE) +
labs(title = "Gráfico de correlación",x=NULL,y=NULL) +
theme_minimal() +
theme(legend.position="none")
##Pregunta 3 Realice dos gráficos de puntos, uno que relacione el precio price, el rendimiento en ciudad citympg y que se diferencie por tipo de tracción drivewheel , el segundo entre precio price, el rendimiento en carretera highwaympgy que se diferencie por tipo de tracción drivewheel (8 ptos)
ggplot(carprice,aes(x=price, y=citympg, color=drivewheel)) + geom_point() + scale_color_brewer(palette='Paired')
ggplot(carprice,aes(x=price, y=highwaympg, color=drivewheel)) + geom_point() + scale_color_brewer(palette='Paired')
##Pregunta 4 Realice una regresión múltiple entre precio con las variables enginetype y fuelsystem. ¿Qué podrian decir de la significacia de las variables?(5 ptos)
Calcule el predicho para cada observación.(2 ptos)
regresion <- lm(formula = price ~ enginetype + fuelsystem,
data = carprice)
sum1 <- summary(regresion)
sum1
##
## Call:
## lm(formula = price ~ enginetype + fuelsystem, data = carprice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -11599.4 -2871.7 -399.4 1660.6 25431.8
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9788.7 2682.8 3.649 0.000340 ***
## enginetypedohcv 13284.1 6391.2 2.078 0.039002 *
## enginetypel -2804.7 2602.7 -1.078 0.282566
## enginetypeohc -2233.2 1941.6 -1.150 0.251503
## enginetypeohcf 261.6 2446.3 0.107 0.914966
## enginetypeohcv 6982.0 2458.2 2.840 0.004996 **
## enginetyperotor -2471.4 6391.2 -0.387 0.699418
## fuelsystem2bbl -371.1 2010.6 -0.185 0.853751
## fuelsystem4bbl 4827.7 7370.9 0.655 0.513276
## fuelsystemidi 8425.5 2354.8 3.578 0.000439 ***
## fuelsystemmfi 5408.5 6413.6 0.843 0.400124
## fuelsystemmpfi 8327.7 2013.8 4.135 5.31e-05 ***
## fuelsystemspdi 3434.9 2760.0 1.245 0.214823
## fuelsystemspfi 3492.5 6413.6 0.545 0.586703
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6141 on 191 degrees of freedom
## Multiple R-squared: 0.4468, Adjusted R-squared: 0.4092
## F-statistic: 11.87 on 13 and 191 DF, p-value: < 2.2e-16
prediccion <- predict(regresion)
prediccion
## 1 2 3 4 5 6 7 8
## 18116.417 18116.417 25098.385 15883.239 15883.239 15883.239 15883.239 15883.239
## 9 10 11 12 13 14 15 16
## 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239
## 17 18 19 20 21 22 23 24
## 15883.239 15883.239 6612.921 7184.419 7184.419 7184.419 7184.419 15883.239
## 25 26 27 28 29 30 31 32
## 7184.419 7184.419 7184.419 15883.239 7184.419 12964.000 7555.545 7555.545
## 33 34 35 36 37 38 39 40
## 7555.545 7555.545 7555.545 7555.545 7555.545 7555.545 7555.545 7555.545
## 41 42 43 44 45 46 47 48
## 7555.545 15883.239 7184.419 7184.419 7184.419 7184.419 11048.000 18116.417
## 49 50 51 52 53 54 55 56
## 18116.417 25098.385 7184.419 7184.419 7184.419 7184.419 7184.419 12145.000
## 57 58 59 60 61 62 63 64
## 12145.000 12145.000 15645.000 7184.419 7184.419 7184.419 7184.419 15981.025
## 65 66 67 68 69 70 71 72
## 7184.419 15883.239 15981.025 15981.025 15981.025 15981.025 15981.025 25098.385
## 73 74 75 76 77 78 79 80
## 25098.385 25098.385 25098.385 15883.239 7184.419 7184.419 7184.419 10990.444
## 81 82 83 84 85 86 87 88
## 10990.444 7184.419 10990.444 10990.444 10990.444 7184.419 7184.419 10990.444
## 89 90 91 92 93 94 95 96
## 10990.444 7184.419 15981.025 7184.419 7184.419 7184.419 7184.419 7184.419
## 97 98 99 100 101 102 103 104
## 7184.419 7184.419 7184.419 7184.419 7184.419 25098.385 25098.385 25098.385
## 105 106 107 108 109 110 111 112
## 25098.385 25098.385 25098.385 15311.741 15409.526 15311.741 15409.526 15311.741
## 113 114 115 116 117 118 119 120
## 15409.526 15311.741 15409.526 15311.741 15409.526 15311.741 7184.419 10990.444
## 121 122 123 124 125 126 127 128
## 7184.419 7184.419 7184.419 7184.419 10990.444 15883.239 18377.971 18377.971
## 129 130 131 132 133 134 135 136
## 18377.971 31400.500 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239
## 137 138 139 140 141 142 143 144
## 18116.417 18116.417 9679.150 9679.150 9679.150 9679.150 9679.150 18377.971
## 145 146 147 148 149 150 151 152
## 9679.150 18377.971 9679.150 18377.971 9679.150 18377.971 7184.419 7184.419
## 153 154 155 156 157 158 159 160
## 7184.419 7184.419 7184.419 7184.419 7184.419 7184.419 15981.025 15981.025
## 161 162 163 164 165 166 167 168
## 7184.419 7184.419 7184.419 7184.419 7184.419 18116.417 18116.417 15883.239
## 169 170 171 172 173 174 175 176
## 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15981.025 15883.239
## 177 178 179 180 181 182 183 184
## 15883.239 15883.239 18116.417 18116.417 18116.417 18116.417 15981.025 15883.239
## 185 186 187 188 189 190 191 192
## 15981.025 15883.239 15883.239 15981.025 15883.239 15883.239 15883.239 15883.239
## 193 194 195 196 197 198 199 200
## 15981.025 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239
## 201 202 203 204 205
## 15883.239 15883.239 25098.385 15981.025 15883.239
R: No son son muy representativas, por lo que se debiese buscar mejores variables.
##Pregunta 5 Dado el modelo anterior, calcule la predicción del precio con un tipo de motor rotor y sistema de gasolina igual a idi (5 puntos)
prediccion2 <- predict(regresion, data.table(enginetype = "rotor" , fuelsystem = "idi"))
prediccion2
## 1
## 15742.79
##Pregunta 6 Realice usted un modelo a su preferencia, este debe predecir de mejor manera de mejor manera (7 ptos)
Calcule el predicho para cada observación.(2 ptos)
regresion2 <- lm(formula = price ~ enginesize + horsepower,
data = carprice)
sum2 <- summary(regresion2)
sum2
##
## Call:
## lm(formula = price ~ enginesize + horsepower, data = carprice)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10946.0 -1946.7 -218.8 1775.5 13403.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8389.73 822.53 -10.200 < 2e-16 ***
## enginesize 122.45 10.46 11.709 < 2e-16 ***
## horsepower 58.85 11.01 5.344 2.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3650 on 202 degrees of freedom
## Multiple R-squared: 0.7933, Adjusted R-squared: 0.7913
## F-statistic: 387.7 on 2 and 202 DF, p-value: < 2.2e-16
prediccion2 <- predict(regresion2)
prediccion2
## 1 2 3 4 5 6 7 8
## 14060.443 14060.443 19284.716 10959.429 15030.515 14736.278 14736.278 14736.278
## 9 10 11 12 13 14 15 16
## 15889.465 17066.414 10778.134 10778.134 18812.116 18812.116 18812.116 27911.924
## 17 18 19 20 21 22 23 24
## 27911.924 27911.924 1904.211 6749.818 6749.818 6632.123 6632.123 9612.512
## 25 26 27 28 29 30 31 32
## 6632.123 6632.123 6632.123 9612.512 11727.376 19244.878 6288.543 7347.796
## 33 34 35 36 37 38 39 40
## 4814.427 7347.796 7347.796 7347.796 7347.796 10140.317 10140.317 10140.317
## 41 42 43 44 45 46 47 48
## 10140.317 11023.028 10964.181 9791.985 6749.818 6749.818 11477.730 33558.743
## 49 50 51 52 53 54 55 56
## 33558.743 46946.019 6754.570 6754.570 6754.570 6754.570 6754.570 6125.148
## 57 58 59 60 61 62 63 64
## 6125.148 6125.148 9350.430 11491.986 11491.986 11491.986 11491.986 10315.038
## 65 66 67 68 69 70 71 72
## 11491.986 15814.540 12255.181 21256.304 21256.304 21256.304 21256.304 29384.219
## 73 74 75 76 77 78 79 80
## 29384.219 40151.874 39662.086 19051.148 6877.017 6877.017 6877.017 9612.512
## 81 82 83 84 85 86 87 88
## 11905.740 11727.376 19244.878 19244.878 19244.878 11727.376 11727.376 11905.740
## 89 90 91 92 93 94 95 96
## 11905.740 7548.100 7458.918 7548.100 7548.100 7548.100 7548.100 7548.100
## 97 98 99 100 101 102 103 104
## 7548.100 7548.100 7548.100 12012.109 12012.109 22717.985 22717.985 22717.985
## 105 106 107 108 109 110 111 112
## 23188.764 25542.661 23188.764 12012.109 15812.718 12012.109 15812.718 11894.414
## 113 114 115 116 117 118 119 120
## 15812.718 11894.414 15812.718 12012.109 15812.718 16374.501 6632.123 9612.512
## 121 122 123 124 125 126 127 128
## 6632.123 6632.123 7611.699 11727.376 19244.878 18514.948 27546.404 27546.404
## 129 130 131 132 133 134 135 136
## 27546.404 33415.069 13069.541 13069.541 12899.572 12899.572 12899.572 12899.572
## 137 138 139 140 141 142 143 144
## 15841.943 15841.943 7548.100 9130.406 9130.406 9660.033 9660.033 10366.202
## 145 146 147 148 149 150 151 152
## 9660.033 11366.609 9660.033 10366.202 9660.033 11366.609 6523.933 6523.933
## 153 154 155 156 157 158 159 160
## 6523.933 6523.933 6523.933 6523.933 7729.394 7729.394 8374.894 8374.894
## 161 162 163 164 165 166 167 168
## 7729.394 7729.394 7729.394 7729.394 7729.394 10200.986 10200.986 16313.832
## 169 170 171 172 173 174 175 176
## 16313.832 16313.832 16313.832 16313.832 16313.832 11962.766 9375.301 11962.766
## 177 178 179 180 181 182 183 184
## 11962.766 11962.766 22023.142 22023.142 21728.905 20504.434 6547.693 9959.023
## 185 186 187 188 189 190 191 192
## 6547.693 9959.023 9959.023 7489.252 10841.734 10253.260 10253.260 14736.278
## 193 194 195 196 197 198 199 200
## 7489.252 10135.565 15583.902 15583.902 15583.902 15583.902 17061.661 17061.661
## 201 202 203 204 205
## 15583.902 18290.884 20679.155 15602.911 15583.902
##Pregunta 7 Interprete la diferencia en los errores de predicción entre el Modelo 1 y el Modelo 2. ¿Qué modelo hace una mejor predicción dentro de muestra? (5 puntos)
carprice[, predict1 := predict(regresion)]
carprice[,predict2 := predict(regresion2)]
MAE1 <- sum(abs(carprice$predict1-carprice$price))/nrow(carprice)
MAE2 <- sum(abs(carprice$predict2-carprice$price))/nrow(carprice)
data.table(MAE1,MAE2)
## MAE1 MAE2
## 1: 3936.92 2595.422
R: Por lo tanto, la seguda prediccion realizada con las variables enginesize y horsepower, realiza una mejor prediccion dentro de la muestra que la primera.
##Pregunta 8 Realice validación cruzada (CV) a los modelos de la pregunta anterior por el método K-folds con 5 folds. ¿Se mantienen las conclusiones obtenidas en el análisis dentro de muestra? (9 puntos)
set.seed(12345)
setupKCV <- trainControl(method = "cv" , number = 100)