suppressMessages(library(leaps))
suppressMessages(library(ggplot2))
suppressMessages(library(readr))
suppressMessages(library(corrplot))
suppressMessages(library(dplyr))
suppressMessages(library(fastDummies))
suppressMessages(library(glmnet))
suppressMessages(library(caret))
suppressMessages(library(pls))
suppressMessages(library(mctest))
suppressMessages(library(car))
options(scipen = 999)
raw_data = read.csv("ToyotaCorolla.csv")
raw_data$Id = NULL
raw_data$Model = NULL
raw_data
str(raw_data)
'data.frame': 1436 obs. of 35 variables:
$ Price : int 13500 13750 13950 14950 13750 12950 16900 18600 21500 12950 ...
$ Age_08_04 : int 23 23 24 26 30 32 27 30 27 23 ...
$ Mfg_Month : int 10 10 9 7 3 1 6 3 6 10 ...
$ Mfg_Year : int 2002 2002 2002 2002 2002 2002 2002 2002 2002 2002 ...
$ KM : int 46986 72937 41711 48000 38500 61000 94612 75889 19700 71138 ...
$ Fuel_Type : Factor w/ 3 levels "CNG","Diesel",..: 2 2 2 2 2 2 2 2 3 2 ...
$ HP : int 90 90 90 90 90 90 90 90 192 69 ...
$ Met_Color : int 1 1 1 0 0 0 1 1 0 0 ...
$ Automatic : int 0 0 0 0 0 0 0 0 0 0 ...
$ cc : int 2000 2000 2000 2000 2000 2000 2000 2000 1800 1900 ...
$ Doors : int 3 3 3 3 3 3 3 3 3 3 ...
$ Cylinders : int 4 4 4 4 4 4 4 4 4 4 ...
$ Gears : int 5 5 5 5 5 5 5 5 5 5 ...
$ Quarterly_Tax : int 210 210 210 210 210 210 210 210 100 185 ...
$ Weight : int 1165 1165 1165 1165 1170 1170 1245 1245 1185 1105 ...
$ Mfr_Guarantee : int 0 0 1 1 1 0 0 1 0 0 ...
$ BOVAG_Guarantee : int 1 1 1 1 1 1 1 1 1 1 ...
$ Guarantee_Period: int 3 3 3 3 3 3 3 3 3 3 ...
$ ABS : int 1 1 1 1 1 1 1 1 1 1 ...
$ Airbag_1 : int 1 1 1 1 1 1 1 1 1 1 ...
$ Airbag_2 : int 1 1 1 1 1 1 1 1 0 1 ...
$ Airco : int 0 1 0 0 1 1 1 1 1 1 ...
$ Automatic_airco : int 0 0 0 0 0 0 0 0 0 0 ...
$ Boardcomputer : int 1 1 1 1 1 1 1 1 0 1 ...
$ CD_Player : int 0 1 0 0 0 0 0 1 0 0 ...
$ Central_Lock : int 1 1 0 0 1 1 1 1 1 0 ...
$ Powered_Windows : int 1 0 0 0 1 1 1 1 1 0 ...
$ Power_Steering : int 1 1 1 1 1 1 1 1 1 1 ...
$ Radio : int 0 0 0 0 0 0 0 0 1 0 ...
$ Mistlamps : int 0 0 0 0 1 1 0 0 0 0 ...
$ Sport_Model : int 0 0 0 0 0 0 1 0 0 0 ...
$ Backseat_Divider: int 1 1 1 1 1 1 1 1 0 1 ...
$ Metallic_Rim : int 0 0 0 0 0 0 0 0 1 0 ...
$ Radio_cassette : int 0 0 0 0 0 0 0 0 1 0 ...
$ Tow_Bar : int 0 0 0 0 0 0 0 0 0 0 ...
summary(raw_data)
Price Age_08_04 Mfg_Month Mfg_Year KM
Min. : 4350 Min. : 1.00 Min. : 1.000 Min. :1998 Min. : 1
1st Qu.: 8450 1st Qu.:44.00 1st Qu.: 3.000 1st Qu.:1998 1st Qu.: 43000
Median : 9900 Median :61.00 Median : 5.000 Median :1999 Median : 63390
Mean :10731 Mean :55.95 Mean : 5.549 Mean :2000 Mean : 68533
3rd Qu.:11950 3rd Qu.:70.00 3rd Qu.: 8.000 3rd Qu.:2001 3rd Qu.: 87021
Max. :32500 Max. :80.00 Max. :12.000 Max. :2004 Max. :243000
Fuel_Type HP Met_Color Automatic cc
CNG : 17 Min. : 69.0 Min. :0.0000 Min. :0.00000 Min. : 1300
Diesel: 155 1st Qu.: 90.0 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.: 1400
Petrol:1264 Median :110.0 Median :1.0000 Median :0.00000 Median : 1600
Mean :101.5 Mean :0.6748 Mean :0.05571 Mean : 1577
3rd Qu.:110.0 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.: 1600
Max. :192.0 Max. :1.0000 Max. :1.00000 Max. :16000
Doors Cylinders Gears Quarterly_Tax Weight
Min. :2.000 Min. :4 Min. :3.000 Min. : 19.00 Min. :1000
1st Qu.:3.000 1st Qu.:4 1st Qu.:5.000 1st Qu.: 69.00 1st Qu.:1040
Median :4.000 Median :4 Median :5.000 Median : 85.00 Median :1070
Mean :4.033 Mean :4 Mean :5.026 Mean : 87.12 Mean :1072
3rd Qu.:5.000 3rd Qu.:4 3rd Qu.:5.000 3rd Qu.: 85.00 3rd Qu.:1085
Max. :5.000 Max. :4 Max. :6.000 Max. :283.00 Max. :1615
Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS Airbag_1
Min. :0.0000 Min. :0.0000 Min. : 3.000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.: 3.000 1st Qu.:1.0000 1st Qu.:1.0000
Median :0.0000 Median :1.0000 Median : 3.000 Median :1.0000 Median :1.0000
Mean :0.4095 Mean :0.8955 Mean : 3.815 Mean :0.8134 Mean :0.9708
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000 Max. :36.000 Max. :1.0000 Max. :1.0000
Airbag_2 Airco Automatic_airco Boardcomputer CD_Player
Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
Median :1.0000 Median :1.0000 Median :0.00000 Median :0.0000 Median :0.0000
Mean :0.7228 Mean :0.5084 Mean :0.05641 Mean :0.2946 Mean :0.2187
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000
Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000 Max. :1.0000
Central_Lock Powered_Windows Power_Steering Radio Mistlamps
Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.000
1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.000
Median :1.0000 Median :1.000 Median :1.0000 Median :0.0000 Median :0.000
Mean :0.5801 Mean :0.562 Mean :0.9777 Mean :0.1462 Mean :0.257
3rd Qu.:1.0000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:1.000
Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.000
Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Tow_Bar
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.3001 Mean :0.7702 Mean :0.2047 Mean :0.1455 Mean :0.2779
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
Observaciones
* El valor maximo de cc es de 16000, demasiado alto considerando la media.
* El atributo Fuel_Type es de tipo char, requerira un proceso de encoding.
* El valor de Cylinder es constante.
sum(is.na(raw_data))
[1] 0
par(mfrow=c(1,2))
boxplot(raw_data$Price,main = "Precio Vehiculos Toyota Corolla",
ylab = "Precio ($)", notch = TRUE)
hist(raw_data$Price,main = "Precio Vehiculos Toyota Corolla")
par(mfrow = c(1, 2))
boxplot(raw_data$Age_08_04, main = "Age_08_04")
barplot(table(as.factor(raw_data$Age_08_04)), main = "Age_08_04")
par(mfrow = c(1, 2))
boxplot(raw_data$Mfg_Year, main = "Año de Mfg_Year")
barplot(table(as.factor(raw_data$Mfg_Year)), main = "Mfg_Year")
par(mfrow = c(1, 2))
boxplot(raw_data$KM,
main = "KM",
ylab = "KM",
notch = TRUE)
hist(raw_data$KM, main = "KM")
par(mfrow = c(1, 2))
boxplot(raw_data$HP,
main = "HP",
ylab = "HP",
notch = FALSE)
barplot(table(as.factor(raw_data$HP)), main = "HP")
par(mfrow = c(1, 2))
boxplot(raw_data$cc,
main = "Cilindrada",
ylab = "CC",
notch = FALSE)
barplot(table(as.factor(raw_data$cc)), main = "Cilindrada")
par(mfrow = c(1, 2))
boxplot(raw_data$Quarterly_Tax, main = "Quarterly_Tax")
hist(raw_data$Quarterly_Tax)
par(mfrow = c(1, 2))
boxplot(raw_data$Weight, main = "Peso(KG)")
hist(raw_data$Weight)
lbls <- c("0: No tiene", "1: Tiene")
par(mfrow = c(1, 2))
barplot(table(as.factor(raw_data$Fuel_Type)), main = "Fuel_Type")
pie(
x = table(raw_data$Radio_cassette),
labels = lbls,
main = "Radio Cassete"
)
par(mfrow = c(1, 2))
pie(
x = table(raw_data$Metallic_Rim),
labels = lbls,
main = "Metallic Rim"
)
pie(
x = table(raw_data$Backseat_Divider) ,
labels = lbls,
main = "Backseat_Divider"
)
par(mfrow = c(1, 3))
pie(x = table(raw_data$Mistlamps) ,
labels = lbls,
main = "Mistlamps")
pie(x = table(raw_data$Radio),
labels = lbls,
main = "Radio")
pie(x = table(raw_data$Sport_Model),
labels = lbls,
main = "Sport_Model")
par(mfrow = c(1, 3))
pie(
x = table(raw_data$Central_Lock),
labels = lbls,
main = "Central_Lock"
)
pie(x = table(raw_data$CD_Player),
labels = lbls,
main = "CD_Player")
pie(
x = table(raw_data$Boardcomputer),
labels = lbls,
main = "Boardcomputer"
)
par(mfrow = c(1, 3))
pie(x = table(raw_data$Airco),
labels = lbls,
main = "Airco")
pie(x = table(raw_data$Airbag_2),
labels = lbls,
main = "Airbag_2")
pie(x = table(raw_data$Airbag_1),
labels = lbls,
main = "Airbag_1")
par(mfrow = c(1, 2))
barplot(table(as.factor(raw_data$Guarantee_Period)), main = "Guarantee_Period")
pie(
x = table(raw_data$Automatic_airco),
labels = lbls,
main = "Automatic_airco"
)
par(mfrow = c(1, 3))
pie(
x = table(raw_data$Mfr_Guarantee),
labels = lbls,
main = "Mfr_Guarantee"
)
barplot(table(as.factor(raw_data$Gears)), main = "Gears")
pie(
x = table(raw_data$BOVAG_Guarantee),
labels = lbls,
main = "BOVAG_Guarantee"
)
par(mfrow = c(1, 3))
barplot(table(as.factor(raw_data$Doors)), main = "Doors")
pie(x = table(raw_data$Automatic),
labels = lbls,
main = "Automatic")
pie(x = table(raw_data$ABS),
labels = lbls,
main = "ABS")
hist(raw_data$Price,
col = "blue",
breaks = 60,
freq = F)
lines(density(raw_data$Price), col = "red", lwd = 2)
rug(raw_data$Price)
plot(Price ~ ., data = raw_data, col = "blue")
corrplot::corrplot(cor(select(raw_data,-c("Fuel_Type"))), type="upper", method="pie")
the standard deviation is zero
imcdiag(dplyr::select(raw_data, -c("Price", "Fuel_Type")), raw_data$Price)
Call:
imcdiag(x = dplyr::select(raw_data, -c("Price", "Fuel_Type")),
y = raw_data$Price)
All Individual Multicollinearity Diagnostics Result
VIF TOL Wi Fi Leamer CVIF Klein IND1 IND2
Age_08_04 Inf 0.0000 Inf Inf 0.0000 -Inf 1 0.0000 2.0114
Mfg_Month Inf 0.0000 Inf Inf 0.0000 -Inf 1 0.0000 2.0114
Mfg_Year Inf 0.0000 Inf Inf 0.0000 -Inf 1 0.0000 2.0114
KM 1.8647 0.5363 37.9120 39.1629 0.7323 -0.0556 0 0.0115 0.9327
HP 1.6023 0.6241 26.4092 27.2805 0.7900 -0.0478 0 0.0133 0.7561
Met_Color 1.1398 0.8773 6.1308 6.3331 0.9367 -0.0340 0 0.0187 0.2468
Automatic 1.0805 0.9255 3.5309 3.6474 0.9620 -0.0322 0 0.0198 0.1499
cc 1.2170 0.8217 9.5136 9.8275 0.9065 -0.0363 0 0.0175 0.3586
Doors 1.2554 0.7966 11.1979 11.5674 0.8925 -0.0374 0 0.0170 0.4092
Cylinders 2.0001 0.5000 43.8472 45.2939 NA -0.0596 0 0.0110 1.0057
Gears 1.2599 0.7937 11.3958 11.7718 0.8909 -0.0376 0 0.0169 0.4149
Quarterly_Tax 2.7801 0.3597 78.0447 80.6197 0.5998 -0.0829 0 0.0077 1.2879
Weight 3.2137 0.3112 97.0581 100.2604 0.5578 -0.0958 0 0.0066 1.3855
Mfr_Guarantee 1.1983 0.8345 8.6960 8.9830 0.9135 -0.0357 0 0.0178 0.3329
BOVAG_Guarantee 1.3712 0.7293 16.2736 16.8105 0.8540 -0.0409 0 0.0156 0.5445
Guarantee_Period 1.5381 0.6502 23.5907 24.3691 0.8063 -0.0458 0 0.0139 0.7036
ABS 2.2232 0.4498 53.6282 55.3976 0.6707 -0.0663 0 0.0096 1.1066
Airbag_1 1.5989 0.6254 26.2590 27.1253 0.7908 -0.0477 0 0.0134 0.7534
Airbag_2 3.0894 0.3237 91.6074 94.6299 0.5689 -0.0921 0 0.0069 1.3603
Airco 1.8361 0.5446 36.6558 37.8652 0.7380 -0.0547 0 0.0116 0.9159
Automatic_airco 1.7419 0.5741 32.5257 33.5988 0.7577 -0.0519 0 0.0123 0.8566
Boardcomputer 2.6305 0.3802 71.4869 73.8455 0.6166 -0.0784 0 0.0081 1.2467
CD_Player 1.5503 0.6450 24.1291 24.9253 0.8031 -0.0462 0 0.0138 0.7140
Central_Lock 4.5886 0.2179 157.3372 162.5283 0.4668 -0.1368 0 0.0047 1.5730
Powered_Windows 4.6078 0.2170 158.1800 163.3990 0.4659 -0.1373 0 0.0046 1.5749
Power_Steering 1.5557 0.6428 24.3626 25.1664 0.8018 -0.0464 0 0.0137 0.7184
Radio 62.3090 0.0160 2688.0184 2776.7064 0.1267 -1.8572 1 0.0003 1.9791
Mistlamps 2.0750 0.4819 47.1335 48.6886 0.6942 -0.0618 0 0.0103 1.0421
Sport_Model 1.4606 0.6846 20.1946 20.8609 0.8274 -0.0435 0 0.0146 0.6343
Backseat_Divider 2.5379 0.3940 67.4257 69.6504 0.6277 -0.0756 0 0.0084 1.2188
Metallic_Rim 1.3400 0.7463 14.9077 15.3995 0.8639 -0.0399 0 0.0159 0.5104
Radio_cassette 62.1291 0.0161 2680.1284 2768.5561 0.1269 -1.8518 1 0.0003 1.9790
Tow_Bar 1.1445 0.8738 6.3348 6.5438 0.9347 -0.0341 0 0.0187 0.2539
1 --> COLLINEARITY is detected by the test
0 --> COLLINEARITY is not detected by the test
HP , Automatic , Doors , Guarantee_Period , ABS , Boardcomputer , Central_Lock , Powered_Windows , Power_Steering , Mistlamps , Backseat_Divider , coefficient(s) are non-significant may be due to multicollinearity
R-square of y on all x: 0.9058
* use method argument to check which regressors may be the reason of collinearity
===================================
El atributo CC presenta un outlier(valor atípico) de CC = 16000. No es un valor coherente con el contexto de un vehiculo Toyota Corolla. Considero que probablemente fue un error y supongo que se agrego un cero de más, siendo el valor correcto 1600.
El atributo Guarantee_Period presenta un outlier de Guarantee_Period = 13.Considero que probablemente fue un error y decido imputar el valor 12.
El atributo KM presenta outliers para valores superiores a 150000 y valores menos a 20000. Si bien son valores coherentes dentro del contexto de vehiculos, al estar la mayor concentracion de los vehiculos dentro del intervalo (10000,120000), decido recortar el dataSet, reduciendo su tamaño un 12%.
Tras realizar estas operaciones, el dataset restante posee un 72% de las instancias del dataset original.
Por ultimo quitamos también los atributos Fuel_Type y Automatic_airco ya que luego de la limpieza eran prácticamente casi constante con entre 6 a 2 valores distintos a los normales, es decir prácticamente todos eran tipo Petrol y solo había 6 que tenían Automatic_airco(esto nos dio problema a la hora del escalado ), por lo cual concluimos que era lo mejor sacarlos ya que en estos momentos no nos aporta información significativa
clean_data <- raw_data
clean_data$Cylinders <- NULL
clean_data$Mfg_Month <- NULL
clean_data$Mfg_Year <- NULL
clean_data$cc <- ifelse(clean_data$cc == 16000, 1600, raw_data$cc)
clean_data <- filter(clean_data, clean_data$Weight < 1100)
clean_data <- filter(clean_data, clean_data$Weight > 1010)
clean_data <- filter(clean_data, clean_data$KM > 20000)
clean_data <- filter(clean_data, clean_data$KM < 150000)
clean_data = subset(clean_data, !(KM / Age_08_04 > 3500 |
KM / Age_08_04 < 10))
clean_data = subset(clean_data, Guarantee_Period < 15)
clean_data[, c("Fuel_Type","Automatic_airco")] <- NULL
clean_data
pca.dataset <- clean_data[, -1]
dim(pca.dataset)
[1] 1039 29
apply(X = pca.dataset, MARGIN = 2, FUN = mean)
Age_08_04 KM HP Met_Color Automatic
61.18190568 67759.09720885 103.62656400 0.66794995 0.04908566
cc Doors Gears Quarterly_Tax Weight
1518.45235804 3.97882579 5.03272377 74.79114533 1052.63426372
Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS Airbag_1
0.43599615 0.91049086 3.65543792 0.79595765 0.97112608
Airbag_2 Airco Boardcomputer CD_Player Central_Lock
0.70837344 0.46198268 0.21847931 0.16169394 0.54571704
Powered_Windows Power_Steering Radio Mistlamps Sport_Model
0.53512993 0.97690087 0.15688162 0.25601540 0.25216554
Backseat_Divider Metallic_Rim Radio_cassette Tow_Bar
0.79210780 0.21462945 0.15591915 0.31472570
apply(X = pca.dataset, MARGIN = 2, FUN = var)
Age_08_04 KM HP Met_Color
171.40329463 674170346.25547385 99.18603741 0.22200649
Automatic cc Doors Gears
0.04672122 16208.81251611 0.91477280 0.03361020
Quarterly_Tax Weight Mfr_Guarantee BOVAG_Guarantee
279.79930495 472.77747612 0.24614041 0.08157577
Guarantee_Period ABS Airbag_1 Airbag_2
4.50351327 0.16256553 0.02806723 0.20677953
Airco Boardcomputer CD_Player Central_Lock
0.24879414 0.17091059 0.13567959 0.24814879
Powered_Windows Power_Steering Radio Mistlamps
0.24900555 0.02258730 0.13239720 0.19065501
Sport_Model Backseat_Divider Metallic_Rim Radio_cassette
0.18875976 0.16483168 0.16872604 0.13173516
Tow_Bar
0.21588121
pca.model <- prcomp( scale(pca.dataset[, -1]))
names(pca.model)
[1] "sdev" "rotation" "center" "scale" "x"
** PCA identifica aquellas direcciones en las que la varianza es mayor. Como la varianza se mide elevada al cuadrado, si antes de calcular no se estandarizan todas los predictores para que tengan media 0 y desviación estándar 1, aquellas variables cuyo valor sin escalar sea mayor dominarán al resto.
pca.model$rotation[, 1:5]
PC1 PC2 PC3 PC4 PC5
KM 0.040350094 -0.050108978 0.136548576 -0.067189304 0.38755008
HP -0.253999330 -0.275481830 -0.215682276 0.077923749 0.20851253
Met_Color -0.084738496 -0.033299302 -0.099831950 -0.142793199 -0.08972053
Automatic 0.045771728 0.008266144 0.003453250 0.007064506 -0.02078867
cc -0.258037463 -0.284115625 -0.207807429 0.085724610 0.24297055
Doors -0.141177395 -0.160671718 -0.273341662 0.118035572 0.03997757
Gears -0.110239944 -0.021876181 0.280431657 -0.166644392 0.13998534
Quarterly_Tax -0.225280867 -0.006211802 -0.164565637 0.062624440 0.20013145
Weight -0.293321630 -0.248059141 -0.230904531 0.095221252 0.20799911
Mfr_Guarantee -0.109965252 0.106050716 -0.171956286 0.015271211 -0.13458451
BOVAG_Guarantee -0.142120603 0.212430329 -0.095978258 -0.161050401 0.07673289
Guarantee_Period 0.163552243 -0.304701521 -0.003874779 0.030985940 -0.13997754
ABS -0.196663781 0.199415959 -0.089569574 -0.120884137 -0.14579308
Airbag_1 -0.165346530 0.177441720 -0.074591934 -0.250195231 0.05431843
Airbag_2 -0.247750596 0.333159318 -0.099857302 -0.069939302 -0.04592466
Airco -0.287957518 -0.156094756 0.158585114 0.007819130 -0.16141267
Boardcomputer -0.161042843 0.103683774 -0.209504317 0.103824604 -0.42532866
CD_Player -0.134251152 0.085951689 -0.147042540 0.179102889 -0.32630399
Central_Lock -0.281555998 -0.198380984 0.300191486 -0.059075329 -0.15732043
Powered_Windows -0.281685215 -0.180986295 0.318315436 -0.043306049 -0.16665923
Power_Steering -0.151182776 0.142834914 -0.065338332 -0.240536972 0.06137063
Radio 0.085709143 -0.229861129 -0.153997364 -0.563039752 -0.14705343
Mistlamps -0.294899586 -0.036768320 0.323717657 0.001192913 -0.08710399
Sport_Model 0.003524285 0.217098400 0.105832736 -0.181062877 0.35642718
Backseat_Divider -0.241877015 0.338818879 -0.053071601 -0.100070152 0.10517785
Metallic_Rim -0.182742572 -0.028452166 0.322116342 -0.077510618 0.03987794
Radio_cassette 0.088408063 -0.228697051 -0.155135978 -0.559411000 -0.14286628
Tow_Bar -0.017146109 -0.126980091 -0.169823321 -0.113124656 0.07315011
biplot(x = pca.model, cex = 0.6, col = c("blue4", "brown3"), scale = 1)
pca.model$sdev^2
[1] 4.4410167 3.2928329 2.2012718 1.9515595 1.7974601 1.3602633 1.1998512 1.1355710
[9] 1.0916158 1.0094177 0.8930742 0.8340511 0.8024924 0.7379852 0.7294365 0.6875231
[17] 0.6293235 0.6008787 0.5032505 0.4788848 0.4477867 0.4181100 0.2782180 0.1982454
[25] 0.1266767 0.1050313 0.0378362 0.0103357
prop.variance <- pca.model$sdev^2/sum(pca.model$sdev^2)
prop.variance
[1] 0.1586077401 0.1176011745 0.0786168511 0.0696985519 0.0641950046 0.0485808325
[7] 0.0428518283 0.0405561072 0.0389862776 0.0360506307 0.0318955066 0.0297875386
[13] 0.0286604429 0.0263566136 0.0260513026 0.0245543971 0.0224758401 0.0214599530
[19] 0.0179732322 0.0171030285 0.0159923817 0.0149325006 0.0099363557 0.0070801942
[25] 0.0045241693 0.0037511194 0.0013512928 0.0003691323
ggplot(data = data.frame(prop.variance, pc = 1:(dim(pca.model$x)[2])),
aes(x = pc, y = prop.variance)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0,0.2)) +
theme_bw() +
labs(x = "Componente principal",
y = "Prop. de varianza explicada")
ggplot(data = data.frame(prop.variance, pc = 1: (dim(pca.model$x)[2])),
aes(x = pc, y = prop.variance)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0,1)) +
theme_bw() +
labs(x = "Componentes",
y = "Prop. de varianza explicada")
prop.cumulative <- cumsum(prop.variance)
prop.cumulative
[1] 0.1586077 0.2762089 0.3548258 0.4245243 0.4887193 0.5373002 0.5801520 0.6207081
[9] 0.6596944 0.6957450 0.7276405 0.7574280 0.7860885 0.8124451 0.8384964 0.8630508
[17] 0.8855266 0.9069866 0.9249598 0.9420629 0.9580552 0.9729877 0.9829241 0.9900043
[25] 0.9945285 0.9982796 0.9996309 1.0000000
ggplot(data = data.frame(prop.cumulative, pc = 1: (dim(pca.model$x)[2])),
aes(x = pc, y = prop.cumulative)) +
geom_col(width = 0.3) +
scale_y_continuous(limits = c(0,1)) +
theme_bw() +
labs(x = "Componentes",
y = "Prop. de varianza acumulada explicada")
ggplot(data = data.frame(prop.cumulative, pc = factor(1:(dim(pca.model$x)[2]))),
aes(x = pc, y = prop.cumulative, group = 1)) +
geom_point() +
geom_line() +
geom_label(aes(label = round(prop.cumulative,2))) +
theme_bw() +
labs(x = "Componentes principales",
y = "Prop. varianza explicada acumulada")
pcr.dataset <- clean_data
dim(pcr.dataset)
[1] 1039 30
set.seed(14341)
indices <- createDataPartition(pcr.dataset$Price, p = 0.8, list = FALSE)
summary(pcr.dataset[-indices,])
Price Age_08_04 KM HP Met_Color
Min. : 5600 Min. :13.0 Min. : 20629 Min. : 86.0 Min. :0.0000
1st Qu.: 8350 1st Qu.:50.0 1st Qu.: 47690 1st Qu.: 97.0 1st Qu.:0.0000
Median : 9500 Median :63.0 Median : 66718 Median :110.0 Median :1.0000
Mean : 9762 Mean :61.0 Mean : 67914 Mean :103.1 Mean :0.6184
3rd Qu.:10950 3rd Qu.:72.5 3rd Qu.: 82011 3rd Qu.:110.0 3rd Qu.:1.0000
Max. :18500 Max. :80.0 Max. :146197 Max. :110.0 Max. :1.0000
Automatic cc Doors Gears Quarterly_Tax
Min. :0.00000 Min. :1300 Min. :2.000 Min. :5.000 Min. :19.00
1st Qu.:0.00000 1st Qu.:1400 1st Qu.:3.000 1st Qu.:5.000 1st Qu.:69.00
Median :0.00000 Median :1600 Median :4.000 Median :5.000 Median :69.00
Mean :0.04348 Mean :1511 Mean :3.976 Mean :5.034 Mean :73.45
3rd Qu.:0.00000 3rd Qu.:1600 3rd Qu.:5.000 3rd Qu.:5.000 3rd Qu.:85.00
Max. :1.00000 Max. :1600 Max. :5.000 Max. :6.000 Max. :85.00
Weight Mfr_Guarantee BOVAG_Guarantee Guarantee_Period ABS
Min. :1015 Min. :0.0000 Min. :0.0000 Min. : 3.000 Min. :0.0000
1st Qu.:1035 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.: 3.000 1st Qu.:1.0000
Median :1050 Median :0.0000 Median :1.0000 Median : 3.000 Median :1.0000
Mean :1053 Mean :0.4106 Mean :0.8841 Mean : 3.667 Mean :0.8068
3rd Qu.:1075 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 3.000 3rd Qu.:1.0000
Max. :1090 Max. :1.0000 Max. :1.0000 Max. :12.000 Max. :1.0000
Airbag_1 Airbag_2 Airco Boardcomputer CD_Player
Min. :0.000 Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :1.000 Median :1.000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.971 Mean :0.686 Mean :0.4058 Mean :0.1981 Mean :0.1594
3rd Qu.:1.000 3rd Qu.:1.000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :1.000 Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
Central_Lock Powered_Windows Power_Steering Radio Mistlamps
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :1.0000 Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000
Mean :0.5121 Mean :0.4976 Mean :0.9903 Mean :0.1739 Mean :0.2367
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
Sport_Model Backseat_Divider Metallic_Rim Radio_cassette Tow_Bar
Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
Median :0.0000 Median :1.0000 Median :0.0000 Median :0.0000 Median :0.0000
Mean :0.3188 Mean :0.7874 Mean :0.1787 Mean :0.1739 Mean :0.3382
3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:1.0000
Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
set.seed(1)
pcr.model <-
pcr(
formula = Price ~ .,
data = pcr.dataset[indices,],
scale = TRUE
)
summary(pcr.model)
Data: X dimension: 832 29
Y dimension: 832 1
Fit method: svdpc
Number of components considered: 29
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps
X 15.77 27.01 35.27 42.57 49.39 54.19 58.41 62.30 66.15
Price 30.88 30.93 57.74 66.70 67.17 68.53 68.76 68.79 69.63
10 comps 11 comps 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
X 69.67 72.72 75.66 78.47 80.98 83.46 85.84 88.02
Price 69.63 70.23 70.74 70.82 71.44 71.49 71.91 72.13
18 comps 19 comps 20 comps 21 comps 22 comps 23 comps 24 comps 25 comps
X 90.12 91.88 93.54 95.09 96.57 97.59 98.38 99.03
Price 72.54 72.65 73.21 73.40 73.41 74.34 78.22 78.25
26 comps 27 comps 28 comps 29 comps
X 99.47 99.80 99.96 100.00
Price 78.29 78.42 78.42 78.44
validationplot(pcr.model, val.type = "RMSEP")
Se observa que el valor minimo de RMSE se obtiene cuando el numero de componentes es 32 aproximadamente, sin embargo esto produce un modelo con poca flexibilidad, inadaptable para el nuevo ingreso de datos. Se tomó como cantidad de componentes 5, dado que luego de este valor el descenso del RMSE es muy pequeño.
pcr.model.pred <-
predict(pcr.model, newdata = pcr.dataset[-indices,], ncomp = 5)
MSE.test <-
mean((pcr.model.pred - pcr.dataset[-indices,]$Price) ^ 2)
MSE.test
[1] 1339993
pcr.model <-
pcr(
formula = Price ~ .,
data = pcr.dataset[-indices, ],
scale = TRUE,
validation = "CV"
)
summary(pcr.model)
Data: X dimension: 207 29
Y dimension: 207 1
Fit method: svdpc
Number of components considered: 29
VALIDATION: RMSEP
Cross-validated using 10 random segments.
(Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
CV 2002 1768 1763 1322 1187 1155 1159 1167
adjCV 2002 1765 1762 1320 1181 1148 1154 1163
8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
CV 1165 1123 1131 1140 1133 1136 1109 1108
adjCV 1163 1118 1125 1135 1126 1131 1103 1099
16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps 23 comps
CV 1113 1094 1120 1099 1098 1114 1124 1062
adjCV 1108 1089 1116 1091 1090 1105 1114 1052
24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
CV 1062 1059 1061 1089 1103 1127
adjCV 1052 1049 1052 1078 1091 1068
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps
X 16.43 28.84 38.07 45.34 51.22 56.54 61.41 65.51 69.27
Price 23.97 25.79 58.04 67.06 69.22 69.44 69.47 69.96 71.58
10 comps 11 comps 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
X 72.77 75.90 78.73 81.50 84.13 86.23 88.31 90.23
Price 72.01 72.47 73.33 73.81 74.83 75.23 75.33 75.92
18 comps 19 comps 20 comps 21 comps 22 comps 23 comps 24 comps 25 comps
X 92.02 93.72 95.06 96.30 97.33 98.06 98.70 99.27
Price 76.54 77.22 77.48 77.48 77.49 79.99 80.06 80.12
26 comps 27 comps 28 comps 29 comps
X 99.69 99.99 100.0 100.00
Price 80.22 80.26 80.3 80.54
pcr.model.cv <- MSEP(pcr.model, estimate = "CV")
which.min(pcr.model.cv$val)
[1] 26
plot(
pcr.model.cv$val,
main = "MSE vs N° componentes",
type = "l",
ylab = "MSE",
col = "blue",
xlab = "Componentes"
)
plot(
as.numeric(sqrt(pcr.model$validation$PRESS)),
type = "b",
pch = 19,
ylab = expression(sqrt("PRESS"))
)
axis(side = 2, at = 1:32)
pcr.model.pred <-
predict(pcr.model, newdata = pcr.dataset[-indices,])
mse.test <-
mean((pcr.model.pred - pcr.dataset[-indices,]$Price) ^ 2)
mse.test
[1] 1142994
sqrt(mse.test)
[1] 1069.109
pls.dataset <- clean_data
indices <-
createDataPartition(pls.dataset$Price, p = 0.8, list = FALSE)
set.seed(124543)
pls.model <-
plsr(
formula = Price ~ .,
data = pls.dataset[indices, ],
scale = FALSE,
validation = "CV"
)
summary(pls.model)
Data: X dimension: 832 29
Y dimension: 832 1
Fit method: kernelpls
Number of components considered: 29
VALIDATION: RMSEP
Cross-validated using 10 random segments.
(Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
CV 1879 1631 1558 1077 967.7 966.5 955.7 913.2
adjCV 1879 1630 1557 1076 967.2 966.1 954.2 908.1
8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
CV 916.2 914.5 915.2 917.4 920.1 920.3 920.0 919.8
adjCV 914.5 913.1 913.6 915.6 918.0 918.2 917.8 917.7
16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps 23 comps
CV 919.7 919.9 919.7 919.1 919.3 919.9 920.5 920.4
adjCV 917.5 917.8 917.5 917.1 917.2 917.8 918.3 918.2
24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
CV 920.3 920.3 920.3 920.3 920.3 920.3
adjCV 918.1 918.1 918.1 918.1 918.1 918.1
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps
X 100.00 100.00 100.00 100.0 100.00 100.00 100.00 100.00 100.00
Price 24.73 31.59 67.78 73.9 73.96 75.17 77.11 77.26 77.74
10 comps 11 comps 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
X 100.00 100.00 100.00 100.00 100.00 100.00 100.00 100.00
Price 77.83 77.89 77.93 77.96 77.97 77.98 77.98 77.99
18 comps 19 comps 20 comps 21 comps 22 comps 23 comps 24 comps 25 comps
X 100.00 100.00 100.00 100.00 100 100 100 100
Price 77.99 77.99 77.99 77.99 78 78 78 78
26 comps 27 comps 28 comps 29 comps
X 100 100 100 100
Price 78 78 78 78
pls.model.cv <- MSEP(pls.model, estimate = "CV")
which.min(pls.model.cv$val)
[1] 8
plot(
pls.model.cv$val,
main = "MSE vs N Componentes",
type = "l",
ylab = "MSE",
col = "blue",
xlab = "Componentes"
)
Se observa que el menor valor de MSE, se encuentra para valores de 9 componentes. Sin embargo en 5 componentes se encuentra un valor de mse cuya disminucion es muy poca para cantidades de componentes posteriores.
pls.model.pred <-
predict(pls.model, newdata = pls.dataset[-indices,], ncomp = 9)
mse.test <-
mean((pls.model.pred - pls.dataset[-indices,]$Price) ^ 2)
mse.test
[1] 754848.9
set.seed(1223)
pls.model <-
plsr(
formula = Price ~ .,
data = pls.dataset[-indices, ],
scale = TRUE,
validation = "CV"
)
summary(pls.model)
Data: X dimension: 207 29
Y dimension: 207 1
Fit method: kernelpls
Number of components considered: 29
VALIDATION: RMSEP
Cross-validated using 10 random segments.
(Intercept) 1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps
CV 1921 1150 1014 959.9 940.2 924.6 917.8 910.6
adjCV 1921 1148 1010 952.2 932.5 915.0 909.8 902.4
8 comps 9 comps 10 comps 11 comps 12 comps 13 comps 14 comps 15 comps
CV 905.2 901.9 900.2 901.5 900.3 899.8 899.2 902.5
adjCV 897.5 894.5 893.0 894.2 893.1 892.6 892.1 895.1
16 comps 17 comps 18 comps 19 comps 20 comps 21 comps 22 comps 23 comps
CV 903.3 904 907.6 903.1 900.6 900.8 898.2 897.2
adjCV 895.7 896 898.6 894.6 892.5 892.9 890.5 889.6
24 comps 25 comps 26 comps 27 comps 28 comps 29 comps
CV 897.0 897.0 896.9 897.0 897.0 897.0
adjCV 889.4 889.4 889.4 889.4 889.4 889.4
TRAINING: % variance explained
1 comps 2 comps 3 comps 4 comps 5 comps 6 comps 7 comps 8 comps 9 comps
X 13.93 23.78 28.27 34.26 38.29 45.77 49.47 55.53 59.95
Price 66.66 75.95 80.62 81.91 83.10 83.47 83.76 83.83 83.87
10 comps 11 comps 12 comps 13 comps 14 comps 15 comps 16 comps 17 comps
X 63.54 65.83 68.39 70.64 73.00 75.47 77.57 79.71
Price 83.88 83.90 83.91 83.91 83.93 83.95 84.01 84.08
18 comps 19 comps 20 comps 21 comps 22 comps 23 comps 24 comps 25 comps
X 81.16 83.54 85.10 86.85 89.06 90.77 92.33 94.43
Price 84.18 84.20 84.21 84.22 84.22 84.22 84.22 84.22
26 comps 27 comps 28 comps 29 comps
X 96.38 97.90 100.00 100.03
Price 84.22 84.22 84.22 84.22
pls.model.pred <-
predict(pls.model, newdata = pls.dataset[-indices,])
mse.test <-
mean((pls.model.pred - pls.dataset[-indices,]$Price) ^ 2)
mse.test
[1] 623256.8
sqrt(mse.test)
[1] 789.4662
Se obtiene un valor de RMSE menor al valor obtenido con PCR. Es una alternativa comparable al valor obtenido mediante Lasso en el TP 4.
Luego del trabajo realizado nos dimos cuenta de que estos tipos de métodos son muy susceptibles a los outlinears y que principalmente sirven para aquellas situaciones en las cuales uno afronta un conjunto de variables correlacionadas.
Consideramos que los datos los resultados obtenidos son significativos ya que representan el 72% de los datos iniciales.
Notamos que la unica forma de determinar que metodo emplear para realizar una seleccion de variables es la comparacion de los valores de error mediante un tipo de validacion aplicada en forma comun para todos los metodos.