rm(list=ls())

library(data.table)
library(ggplot2)
library(leaflet)
library(data.table)

library(gganimate)
library(ggcorrplot)
library(plotly)
library(janitor)
library(RColorBrewer)
library(tidyverse)
library(caret)
library(jtools)
library(scales)
library(lubridate)
library(corrgram)
library(corrplot)
library(knitr)
library(moderndive)




path <- 'C:/Users/Rorro Bielenberg/Desktop/UAI/Data Science/Tarea 5/'
carprice<- fread(paste0(path,'CarPrice_Assignment.csv'))

##Pregunta 1: Muestre un histograma de distribución del precio.(4 ptos)

ggplot(carprice,aes(price)) +
  geom_histogram()

##Pregunta 2 Realice un gráfico de correlación entre las variables númericas, estas son, “wheelbase”, “carlength” “carwidth”,“carheight”, “curbweight”, “enginesize”,“boreratio”,“stroke”, “compressionratio”, “horsepower”, “peakrpm”,“citympg”, “highwaympg”,“price” (4 ptos)

variables <- carprice[,.(wheelbase, carlength, carwidth,carheight, curbweight, enginesize,boreratio,stroke, compressionratio, horsepower, peakrpm,citympg, highwaympg,price)]
corr <- cor(variables)
corr
##                   wheelbase  carlength   carwidth   carheight curbweight
## wheelbase         1.0000000  0.8745875  0.7951436  0.58943476  0.7763863
## carlength         0.8745875  1.0000000  0.8411183  0.49102946  0.8777285
## carwidth          0.7951436  0.8411183  1.0000000  0.27921032  0.8670325
## carheight         0.5894348  0.4910295  0.2792103  1.00000000  0.2955717
## curbweight        0.7763863  0.8777285  0.8670325  0.29557173  1.0000000
## enginesize        0.5693287  0.6833599  0.7354334  0.06714874  0.8505941
## boreratio         0.4887499  0.6064544  0.5591499  0.17107092  0.6484797
## stroke            0.1609590  0.1295326  0.1829417 -0.05530667  0.1687900
## compressionratio  0.2497858  0.1584137  0.1811286  0.26121423  0.1513617
## horsepower        0.3532945  0.5526230  0.6407321 -0.10880206  0.7507393
## peakrpm          -0.3604687 -0.2872422 -0.2200123 -0.32041072 -0.2662432
## citympg          -0.4704136 -0.6709087 -0.6427043 -0.04863963 -0.7574138
## highwaympg       -0.5440819 -0.7046616 -0.6772179 -0.10735763 -0.7974648
## price             0.5778156  0.6829200  0.7593253  0.11933623  0.8353049
##                   enginesize    boreratio      stroke compressionratio
## wheelbase         0.56932868  0.488749875  0.16095905      0.249785845
## carlength         0.68335987  0.606454358  0.12953261      0.158413706
## carwidth          0.73543340  0.559149909  0.18294169      0.181128627
## carheight         0.06714874  0.171070922 -0.05530667      0.261214226
## curbweight        0.85059407  0.648479749  0.16879004      0.151361740
## enginesize        1.00000000  0.583774327  0.20312859      0.028971360
## boreratio         0.58377433  1.000000000 -0.05590898      0.005197339
## stroke            0.20312859 -0.055908983  1.00000000      0.186110110
## compressionratio  0.02897136  0.005197339  0.18611011      1.000000000
## horsepower        0.80976865  0.573676823  0.08093954     -0.204326226
## peakrpm          -0.24465983 -0.254975528 -0.06796375     -0.435740514
## citympg          -0.65365792 -0.584531716 -0.04214475      0.324701425
## highwaympg       -0.67746991 -0.587011784 -0.04393093      0.265201389
## price             0.87414480  0.553173237  0.07944308      0.067983506
##                   horsepower     peakrpm     citympg  highwaympg       price
## wheelbase         0.35329448 -0.36046875 -0.47041361 -0.54408192  0.57781560
## carlength         0.55262297 -0.28724220 -0.67090866 -0.70466160  0.68292002
## carwidth          0.64073208 -0.22001230 -0.64270434 -0.67721792  0.75932530
## carheight        -0.10880206 -0.32041072 -0.04863963 -0.10735763  0.11933623
## curbweight        0.75073925 -0.26624318 -0.75741378 -0.79746479  0.83530488
## enginesize        0.80976865 -0.24465983 -0.65365792 -0.67746991  0.87414480
## boreratio         0.57367682 -0.25497553 -0.58453172 -0.58701178  0.55317324
## stroke            0.08093954 -0.06796375 -0.04214475 -0.04393093  0.07944308
## compressionratio -0.20432623 -0.43574051  0.32470142  0.26520139  0.06798351
## horsepower        1.00000000  0.13107251 -0.80145618 -0.77054389  0.80813882
## peakrpm           0.13107251  1.00000000 -0.11354438 -0.05427481 -0.08526715
## citympg          -0.80145618 -0.11354438  1.00000000  0.97133704 -0.68575134
## highwaympg       -0.77054389 -0.05427481  0.97133704  1.00000000 -0.69759909
## price             0.80813882 -0.08526715 -0.68575134 -0.69759909  1.00000000
ggcorrplot(corr,type = 'lower', lab = TRUE) +
  labs(title = "Gráfico de correlación",x=NULL,y=NULL) +
  theme_minimal() +
  theme(legend.position="none") 

##Pregunta 3 Realice dos gráficos de puntos, uno que relacione el precio price, el rendimiento en ciudad citympg y que se diferencie por tipo de tracción drivewheel , el segundo entre precio price, el rendimiento en carretera highwaympgy que se diferencie por tipo de tracción drivewheel (8 ptos)

ggplot(carprice,aes(x=price, y=citympg, color=drivewheel)) +  geom_point() + scale_color_brewer(palette='Paired') 

ggplot(carprice,aes(x=price, y=highwaympg, color=drivewheel)) +  geom_point() + scale_color_brewer(palette='Paired')

##Pregunta 4 Realice una regresión múltiple entre precio con las variables enginetype y fuelsystem. ¿Qué podrian decir de la significacia de las variables?(5 ptos)

Calcule el predicho para cada observación.(2 ptos)

regresion <-  lm(formula = price ~ enginetype + fuelsystem,
               data = carprice)

sum1 <- summary(regresion)
sum1
## 
## Call:
## lm(formula = price ~ enginetype + fuelsystem, data = carprice)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11599.4  -2871.7   -399.4   1660.6  25431.8 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       9788.7     2682.8   3.649 0.000340 ***
## enginetypedohcv  13284.1     6391.2   2.078 0.039002 *  
## enginetypel      -2804.7     2602.7  -1.078 0.282566    
## enginetypeohc    -2233.2     1941.6  -1.150 0.251503    
## enginetypeohcf     261.6     2446.3   0.107 0.914966    
## enginetypeohcv    6982.0     2458.2   2.840 0.004996 ** 
## enginetyperotor  -2471.4     6391.2  -0.387 0.699418    
## fuelsystem2bbl    -371.1     2010.6  -0.185 0.853751    
## fuelsystem4bbl    4827.7     7370.9   0.655 0.513276    
## fuelsystemidi     8425.5     2354.8   3.578 0.000439 ***
## fuelsystemmfi     5408.5     6413.6   0.843 0.400124    
## fuelsystemmpfi    8327.7     2013.8   4.135 5.31e-05 ***
## fuelsystemspdi    3434.9     2760.0   1.245 0.214823    
## fuelsystemspfi    3492.5     6413.6   0.545 0.586703    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6141 on 191 degrees of freedom
## Multiple R-squared:  0.4468, Adjusted R-squared:  0.4092 
## F-statistic: 11.87 on 13 and 191 DF,  p-value: < 2.2e-16
prediccion <- predict(regresion)
prediccion
##         1         2         3         4         5         6         7         8 
## 18116.417 18116.417 25098.385 15883.239 15883.239 15883.239 15883.239 15883.239 
##         9        10        11        12        13        14        15        16 
## 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 
##        17        18        19        20        21        22        23        24 
## 15883.239 15883.239  6612.921  7184.419  7184.419  7184.419  7184.419 15883.239 
##        25        26        27        28        29        30        31        32 
##  7184.419  7184.419  7184.419 15883.239  7184.419 12964.000  7555.545  7555.545 
##        33        34        35        36        37        38        39        40 
##  7555.545  7555.545  7555.545  7555.545  7555.545  7555.545  7555.545  7555.545 
##        41        42        43        44        45        46        47        48 
##  7555.545 15883.239  7184.419  7184.419  7184.419  7184.419 11048.000 18116.417 
##        49        50        51        52        53        54        55        56 
## 18116.417 25098.385  7184.419  7184.419  7184.419  7184.419  7184.419 12145.000 
##        57        58        59        60        61        62        63        64 
## 12145.000 12145.000 15645.000  7184.419  7184.419  7184.419  7184.419 15981.025 
##        65        66        67        68        69        70        71        72 
##  7184.419 15883.239 15981.025 15981.025 15981.025 15981.025 15981.025 25098.385 
##        73        74        75        76        77        78        79        80 
## 25098.385 25098.385 25098.385 15883.239  7184.419  7184.419  7184.419 10990.444 
##        81        82        83        84        85        86        87        88 
## 10990.444  7184.419 10990.444 10990.444 10990.444  7184.419  7184.419 10990.444 
##        89        90        91        92        93        94        95        96 
## 10990.444  7184.419 15981.025  7184.419  7184.419  7184.419  7184.419  7184.419 
##        97        98        99       100       101       102       103       104 
##  7184.419  7184.419  7184.419  7184.419  7184.419 25098.385 25098.385 25098.385 
##       105       106       107       108       109       110       111       112 
## 25098.385 25098.385 25098.385 15311.741 15409.526 15311.741 15409.526 15311.741 
##       113       114       115       116       117       118       119       120 
## 15409.526 15311.741 15409.526 15311.741 15409.526 15311.741  7184.419 10990.444 
##       121       122       123       124       125       126       127       128 
##  7184.419  7184.419  7184.419  7184.419 10990.444 15883.239 18377.971 18377.971 
##       129       130       131       132       133       134       135       136 
## 18377.971 31400.500 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 
##       137       138       139       140       141       142       143       144 
## 18116.417 18116.417  9679.150  9679.150  9679.150  9679.150  9679.150 18377.971 
##       145       146       147       148       149       150       151       152 
##  9679.150 18377.971  9679.150 18377.971  9679.150 18377.971  7184.419  7184.419 
##       153       154       155       156       157       158       159       160 
##  7184.419  7184.419  7184.419  7184.419  7184.419  7184.419 15981.025 15981.025 
##       161       162       163       164       165       166       167       168 
##  7184.419  7184.419  7184.419  7184.419  7184.419 18116.417 18116.417 15883.239 
##       169       170       171       172       173       174       175       176 
## 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15981.025 15883.239 
##       177       178       179       180       181       182       183       184 
## 15883.239 15883.239 18116.417 18116.417 18116.417 18116.417 15981.025 15883.239 
##       185       186       187       188       189       190       191       192 
## 15981.025 15883.239 15883.239 15981.025 15883.239 15883.239 15883.239 15883.239 
##       193       194       195       196       197       198       199       200 
## 15981.025 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 15883.239 
##       201       202       203       204       205 
## 15883.239 15883.239 25098.385 15981.025 15883.239

R: No son son muy representativas, por lo que se debiese buscar mejores variables.

##Pregunta 5 Dado el modelo anterior, calcule la predicción del precio con un tipo de motor rotor y sistema de gasolina igual a idi (5 puntos)

prediccion2 <- predict(regresion, data.table(enginetype = "rotor" , fuelsystem = "idi"))
prediccion2
##        1 
## 15742.79

##Pregunta 6 Realice usted un modelo a su preferencia, este debe predecir de mejor manera de mejor manera (7 ptos)

Calcule el predicho para cada observación.(2 ptos)

regresion2 <-  lm(formula = price ~ enginesize + horsepower,
               data = carprice)

sum2 <- summary(regresion2)
sum2
## 
## Call:
## lm(formula = price ~ enginesize + horsepower, data = carprice)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10946.0  -1946.7   -218.8   1775.5  13403.1 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -8389.73     822.53 -10.200  < 2e-16 ***
## enginesize    122.45      10.46  11.709  < 2e-16 ***
## horsepower     58.85      11.01   5.344 2.45e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3650 on 202 degrees of freedom
## Multiple R-squared:  0.7933, Adjusted R-squared:  0.7913 
## F-statistic: 387.7 on 2 and 202 DF,  p-value: < 2.2e-16
prediccion2 <- predict(regresion2)
prediccion2
##         1         2         3         4         5         6         7         8 
## 14060.443 14060.443 19284.716 10959.429 15030.515 14736.278 14736.278 14736.278 
##         9        10        11        12        13        14        15        16 
## 15889.465 17066.414 10778.134 10778.134 18812.116 18812.116 18812.116 27911.924 
##        17        18        19        20        21        22        23        24 
## 27911.924 27911.924  1904.211  6749.818  6749.818  6632.123  6632.123  9612.512 
##        25        26        27        28        29        30        31        32 
##  6632.123  6632.123  6632.123  9612.512 11727.376 19244.878  6288.543  7347.796 
##        33        34        35        36        37        38        39        40 
##  4814.427  7347.796  7347.796  7347.796  7347.796 10140.317 10140.317 10140.317 
##        41        42        43        44        45        46        47        48 
## 10140.317 11023.028 10964.181  9791.985  6749.818  6749.818 11477.730 33558.743 
##        49        50        51        52        53        54        55        56 
## 33558.743 46946.019  6754.570  6754.570  6754.570  6754.570  6754.570  6125.148 
##        57        58        59        60        61        62        63        64 
##  6125.148  6125.148  9350.430 11491.986 11491.986 11491.986 11491.986 10315.038 
##        65        66        67        68        69        70        71        72 
## 11491.986 15814.540 12255.181 21256.304 21256.304 21256.304 21256.304 29384.219 
##        73        74        75        76        77        78        79        80 
## 29384.219 40151.874 39662.086 19051.148  6877.017  6877.017  6877.017  9612.512 
##        81        82        83        84        85        86        87        88 
## 11905.740 11727.376 19244.878 19244.878 19244.878 11727.376 11727.376 11905.740 
##        89        90        91        92        93        94        95        96 
## 11905.740  7548.100  7458.918  7548.100  7548.100  7548.100  7548.100  7548.100 
##        97        98        99       100       101       102       103       104 
##  7548.100  7548.100  7548.100 12012.109 12012.109 22717.985 22717.985 22717.985 
##       105       106       107       108       109       110       111       112 
## 23188.764 25542.661 23188.764 12012.109 15812.718 12012.109 15812.718 11894.414 
##       113       114       115       116       117       118       119       120 
## 15812.718 11894.414 15812.718 12012.109 15812.718 16374.501  6632.123  9612.512 
##       121       122       123       124       125       126       127       128 
##  6632.123  6632.123  7611.699 11727.376 19244.878 18514.948 27546.404 27546.404 
##       129       130       131       132       133       134       135       136 
## 27546.404 33415.069 13069.541 13069.541 12899.572 12899.572 12899.572 12899.572 
##       137       138       139       140       141       142       143       144 
## 15841.943 15841.943  7548.100  9130.406  9130.406  9660.033  9660.033 10366.202 
##       145       146       147       148       149       150       151       152 
##  9660.033 11366.609  9660.033 10366.202  9660.033 11366.609  6523.933  6523.933 
##       153       154       155       156       157       158       159       160 
##  6523.933  6523.933  6523.933  6523.933  7729.394  7729.394  8374.894  8374.894 
##       161       162       163       164       165       166       167       168 
##  7729.394  7729.394  7729.394  7729.394  7729.394 10200.986 10200.986 16313.832 
##       169       170       171       172       173       174       175       176 
## 16313.832 16313.832 16313.832 16313.832 16313.832 11962.766  9375.301 11962.766 
##       177       178       179       180       181       182       183       184 
## 11962.766 11962.766 22023.142 22023.142 21728.905 20504.434  6547.693  9959.023 
##       185       186       187       188       189       190       191       192 
##  6547.693  9959.023  9959.023  7489.252 10841.734 10253.260 10253.260 14736.278 
##       193       194       195       196       197       198       199       200 
##  7489.252 10135.565 15583.902 15583.902 15583.902 15583.902 17061.661 17061.661 
##       201       202       203       204       205 
## 15583.902 18290.884 20679.155 15602.911 15583.902

##Pregunta 7 Interprete la diferencia en los errores de predicción entre el Modelo 1 y el Modelo 2. ¿Qué modelo hace una mejor predicción dentro de muestra? (5 puntos)

carprice[, predict1 := predict(regresion)]
carprice[,predict2 := predict(regresion2)]

MAE1 <- sum(abs(carprice$predict1-carprice$price))/nrow(carprice)
MAE2 <- sum(abs(carprice$predict2-carprice$price))/nrow(carprice)

data.table(MAE1,MAE2)
##       MAE1     MAE2
## 1: 3936.92 2595.422

R: Por lo tanto, la seguda prediccion realizada con las variables enginesize y horsepower, realiza una mejor prediccion dentro de la muestra que la primera.

##Pregunta 8 Realice validación cruzada (CV) a los modelos de la pregunta anterior por el método K-folds con 5 folds. ¿Se mantienen las conclusiones obtenidas en el análisis dentro de muestra? (9 puntos)

set.seed(12345)
setupKCV <- trainControl(method = "cv" , number = 100)