Para mayores referencias, consulta en:
# Load the raw automobile table. `header = FALSE` tells read.csv() that the
# file has no header row, so the columns arrive with the default names V1..V26.
# `FALSE` is spelled out because the `F` shortcut is an ordinary variable that
# can be reassigned.
data <- read.csv("imports-85.data", sep = ",", header = FALSE)
# Preview the first ten rows.
head(data, 10)
## V1 V2 V3 V4 V5 V6 V7 V8 V9 V10 V11 V12
## 1 3 ? alfa-romero gas std two convertible rwd front 88.6 168.8 64.1
## 2 3 ? alfa-romero gas std two convertible rwd front 88.6 168.8 64.1
## 3 1 ? alfa-romero gas std two hatchback rwd front 94.5 171.2 65.5
## 4 2 164 audi gas std four sedan fwd front 99.8 176.6 66.2
## 5 2 164 audi gas std four sedan 4wd front 99.4 176.6 66.4
## 6 2 ? audi gas std two sedan fwd front 99.8 177.3 66.3
## 7 1 158 audi gas std four sedan fwd front 105.8 192.7 71.4
## 8 1 ? audi gas std four wagon fwd front 105.8 192.7 71.4
## 9 1 158 audi gas turbo four sedan fwd front 105.8 192.7 71.4
## 10 0 ? audi gas turbo two hatchback 4wd front 99.5 178.2 67.9
## V13 V14 V15 V16 V17 V18 V19 V20 V21 V22 V23 V24 V25 V26
## 1 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111 5000 21 27 13495
## 2 48.8 2548 dohc four 130 mpfi 3.47 2.68 9.0 111 5000 21 27 16500
## 3 52.4 2823 ohcv six 152 mpfi 2.68 3.47 9.0 154 5000 19 26 16500
## 4 54.3 2337 ohc four 109 mpfi 3.19 3.40 10.0 102 5500 24 30 13950
## 5 54.3 2824 ohc five 136 mpfi 3.19 3.40 8.0 115 5500 18 22 17450
## 6 53.1 2507 ohc five 136 mpfi 3.19 3.40 8.5 110 5500 19 25 15250
## 7 55.7 2844 ohc five 136 mpfi 3.19 3.40 8.5 110 5500 19 25 17710
## 8 55.7 2954 ohc five 136 mpfi 3.19 3.40 8.5 110 5500 19 25 18920
## 9 55.9 3086 ohc five 131 mpfi 3.13 3.40 8.3 140 5500 17 20 23875
## 10 52.0 3053 ohc five 131 mpfi 3.13 3.40 7.0 160 5500 16 22 ?
# Give the 26 unnamed columns (V1..V26) descriptive attribute names.
names(data) <- c(
  "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
  "num-of-doors", "body-style", "drive-wheels", "engine-location",
  "wheel-base", "length", "width", "height", "curb-weight", "engine-type",
  "num-of-cylinders", "engine-size", "fuel-system", "bore", "stroke",
  "compression-rate", "horsepower", "peak-rpm", "city-mpg", "highway-mpg",
  "price"
)

# Names of the continuous-valued attributes; only these are kept from here on.
continuous_values_cols <- c(
  "normalized-losses", "wheel-base", "length", "width", "height",
  "curb-weight", "bore", "stroke", "compression-rate", "horsepower",
  "peak-rpm", "city-mpg", "highway-mpg", "price"
)
data <- data[continuous_values_cols]
head(data, 10)
## normalized-losses wheel-base length width height curb-weight bore stroke
## 1 ? 88.6 168.8 64.1 48.8 2548 3.47 2.68
## 2 ? 88.6 168.8 64.1 48.8 2548 3.47 2.68
## 3 ? 94.5 171.2 65.5 52.4 2823 2.68 3.47
## 4 164 99.8 176.6 66.2 54.3 2337 3.19 3.40
## 5 164 99.4 176.6 66.4 54.3 2824 3.19 3.40
## 6 ? 99.8 177.3 66.3 53.1 2507 3.19 3.40
## 7 158 105.8 192.7 71.4 55.7 2844 3.19 3.40
## 8 ? 105.8 192.7 71.4 55.7 2954 3.19 3.40
## 9 158 105.8 192.7 71.4 55.9 3086 3.13 3.40
## 10 ? 99.5 178.2 67.9 52.0 3053 3.13 3.40
## compression-rate horsepower peak-rpm city-mpg highway-mpg price
## 1 9.0 111 5000 21 27 13495
## 2 9.0 111 5000 21 27 16500
## 3 9.0 154 5000 19 26 16500
## 4 10.0 102 5500 24 30 13950
## 5 8.0 115 5500 18 22 17450
## 6 8.5 110 5500 19 25 15250
## 7 8.5 110 5500 19 25 17710
## 8 8.5 110 5500 19 25 18920
## 9 8.3 140 5500 17 20 23875
## 10 7.0 160 5500 16 22 ?
# Missing entries appear in the table as the literal string "?"; turn every
# such cell into a real NA so that R's missing-value tools can detect them.
data <- replace(data, data == "?", NA)
# Count of missing values in each column.
colSums(is.na(data))
## normalized-losses wheel-base length width
## 41 0 0 0
## height curb-weight bore stroke
## 0 0 4 4
## compression-rate horsepower peak-rpm city-mpg
## 0 2 2 0
## highway-mpg price
## 0 4
# Positions of the missing cells. is.na() on a data frame gives a logical
# matrix, so which() reports linear indices that run down the columns.
which(is.na(data))
## [1] 1 2 3 6 8 10 15 16 17 18 44 45 46 47 49
## [16] 50 64 67 72 74 75 76 83 84 85 110 111 114 115 125
## [31] 127 128 129 130 131 132 182 190 192 193 194 1286 1287 1288 1289
## [46] 1491 1492 1493 1494 1976 1977 2181 2182 2675 2710 2711 2795
# Total number of missing cells across the whole table.
sum(is.na(data))
## [1] 57
library(tidyr)
# Discard the rows whose `price` is missing; NAs in other columns are kept.
# Equivalent non-piped form: data <- drop_na(data, price)
data <- data %>% drop_na(price)
# Re-count the missing values per column after dropping those rows.
colSums(is.na(data))
## normalized-losses wheel-base length width
## 37 0 0 0
## height curb-weight bore stroke
## 0 0 4 4
## compression-rate horsepower peak-rpm city-mpg
## 0 2 2 0
## highway-mpg price
## 0 0
# Per-column summary; columns still stored as text are reported with
# Length/Class/Mode instead of numeric quantiles.
summary(data)
## normalized-losses wheel-base length width
## Length:201 Min. : 86.6 Min. :141.1 Min. :60.30
## Class :character 1st Qu.: 94.5 1st Qu.:166.8 1st Qu.:64.10
## Mode :character Median : 97.0 Median :173.2 Median :65.50
## Mean : 98.8 Mean :174.2 Mean :65.89
## 3rd Qu.:102.4 3rd Qu.:183.5 3rd Qu.:66.60
## Max. :120.9 Max. :208.1 Max. :72.00
## height curb-weight bore stroke
## Min. :47.80 Min. :1488 Length:201 Length:201
## 1st Qu.:52.00 1st Qu.:2169 Class :character Class :character
## Median :54.10 Median :2414 Mode :character Mode :character
## Mean :53.77 Mean :2556
## 3rd Qu.:55.50 3rd Qu.:2926
## Max. :59.80 Max. :4066
## compression-rate horsepower peak-rpm city-mpg
## Min. : 7.00 Length:201 Length:201 Min. :13.00
## 1st Qu.: 8.60 Class :character Class :character 1st Qu.:19.00
## Median : 9.00 Mode :character Mode :character Median :24.00
## Mean :10.16 Mean :25.18
## 3rd Qu.: 9.40 3rd Qu.:30.00
## Max. :23.00 Max. :49.00
## highway-mpg price
## Min. :16.00 Length:201
## 1st Qu.:25.00 Class :character
## Median :30.00 Mode :character
## Mean :30.69
## 3rd Qu.:34.00
## Max. :54.00
# These columns are still stored as character vectors (they held the "?"
# markers); convert all of them to numeric in one pass.
text_cols <- c(
  "normalized-losses", "bore", "stroke", "horsepower", "peak-rpm", "price"
)
data[text_cols] <- lapply(data[text_cols], as.numeric)
# Every column should now be summarised numerically.
summary(data)
## normalized-losses wheel-base length width
## Min. : 65 Min. : 86.6 Min. :141.1 Min. :60.30
## 1st Qu.: 94 1st Qu.: 94.5 1st Qu.:166.8 1st Qu.:64.10
## Median :115 Median : 97.0 Median :173.2 Median :65.50
## Mean :122 Mean : 98.8 Mean :174.2 Mean :65.89
## 3rd Qu.:150 3rd Qu.:102.4 3rd Qu.:183.5 3rd Qu.:66.60
## Max. :256 Max. :120.9 Max. :208.1 Max. :72.00
## NA's :37
## height curb-weight bore stroke
## Min. :47.80 Min. :1488 Min. :2.540 Min. :2.070
## 1st Qu.:52.00 1st Qu.:2169 1st Qu.:3.150 1st Qu.:3.110
## Median :54.10 Median :2414 Median :3.310 Median :3.290
## Mean :53.77 Mean :2556 Mean :3.331 Mean :3.257
## 3rd Qu.:55.50 3rd Qu.:2926 3rd Qu.:3.590 3rd Qu.:3.410
## Max. :59.80 Max. :4066 Max. :3.940 Max. :4.170
## NA's :4 NA's :4
## compression-rate horsepower peak-rpm city-mpg
## Min. : 7.00 Min. : 48.0 Min. :4150 Min. :13.00
## 1st Qu.: 8.60 1st Qu.: 70.0 1st Qu.:4800 1st Qu.:19.00
## Median : 9.00 Median : 95.0 Median :5200 Median :24.00
## Mean :10.16 Mean :103.4 Mean :5118 Mean :25.18
## 3rd Qu.: 9.40 3rd Qu.:116.0 3rd Qu.:5500 3rd Qu.:30.00
## Max. :23.00 Max. :262.0 Max. :6600 Max. :49.00
## NA's :2 NA's :2
## highway-mpg price
## Min. :16.00 Min. : 5118
## 1st Qu.:25.00 1st Qu.: 7775
## Median :30.00 Median :10295
## Mean :30.69 Mean :13207
## 3rd Qu.:34.00 3rd Qu.:16500
## Max. :54.00 Max. :45400
##
library(dplyr)
# Mean imputation: in every numeric column, replace each NA with the mean of
# the observed (non-missing) values of that column.
#
# across(where(is.numeric), ...) is the current replacement for the superseded
# mutate_if(). ifelse() is used instead of tidyr::replace_na() because recent
# tidyr versions cast the replacement value to the column's type, so a
# fractional mean can be rejected for integer columns even when they contain
# no NA at all.
data <- data %>%
  mutate(
    across(
      where(is.numeric),
      ~ ifelse(is.na(.x), mean(.x, na.rm = TRUE), .x)
    )
  )
# Confirm that no missing values remain.
colSums(is.na(data))
## normalized-losses wheel-base length width
## 0 0 0 0
## height curb-weight bore stroke
## 0 0 0 0
## compression-rate horsepower peak-rpm city-mpg
## 0 0 0 0
## highway-mpg price
## 0 0
# Keep the unscaled target aside: every column (price included) is rescaled
# below and the original price is written back afterwards.
target <- data$price

# Min-max scaling to [0, 1], column by column: (x - min) / (max - min).
# The extrema are computed once with vapply(), which works on the data frame's
# columns directly instead of coercing the whole frame to a matrix as apply()
# does. Note: a constant column (max == min) would produce NaN here.
col_min <- vapply(data, min, numeric(1))
col_max <- vapply(data, max, numeric(1))
data1 <- sweep(data, 2, col_min, FUN = "-")
data1 <- sweep(data1, 2, col_max - col_min, FUN = "/")

# Restore the unscaled price as the prediction target.
data1$price <- target
summary(data1)
## normalized-losses wheel-base length width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1885 1st Qu.:0.2303 1st Qu.:0.3836 1st Qu.:0.3248
## Median :0.2984 Median :0.3032 Median :0.4791 Median :0.4444
## Mean :0.2984 Mean :0.3556 Mean :0.4940 Mean :0.4777
## 3rd Qu.:0.3770 3rd Qu.:0.4606 3rd Qu.:0.6328 3rd Qu.:0.5385
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## height curb-weight bore stroke
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3500 1st Qu.:0.2642 1st Qu.:0.4357 1st Qu.:0.4952
## Median :0.5250 Median :0.3592 Median :0.5500 Median :0.5810
## Mean :0.4972 Mean :0.4141 Mean :0.5648 Mean :0.5652
## 3rd Qu.:0.6417 3rd Qu.:0.5578 3rd Qu.:0.7429 3rd Qu.:0.6381
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## compression-rate horsepower peak-rpm city-mpg
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1000 1st Qu.:0.1028 1st Qu.:0.2653 1st Qu.:0.1667
## Median :0.1250 Median :0.2196 Median :0.3949 Median :0.3056
## Mean :0.1978 Mean :0.2589 Mean :0.3949 Mean :0.3383
## 3rd Qu.:0.1500 3rd Qu.:0.3178 3rd Qu.:0.5510 3rd Qu.:0.4722
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## highway-mpg price
## Min. :0.0000 Min. : 5118
## 1st Qu.:0.2368 1st Qu.: 7775
## Median :0.3684 Median :10295
## Mean :0.3865 Mean :13207
## 3rd Qu.:0.4737 3rd Qu.:16500
## Max. :1.0000 Max. :45400
# Same min-max scaling as above, this time through scale():
# scale(x, center, scale) computes (x - center) / scale column by column.
data2 <- data
# NOTE(review): despite their names, `med` holds the column minima and `dev`
# the column ranges (max - min); the names are kept for compatibility.
# The minima are computed once and reused for the range.
med <- vapply(data2, min, numeric(1))
dev <- vapply(data2, max, numeric(1)) - med
# scale() returns a matrix, so convert back to a data frame.
data2 <- as.data.frame(scale(data2, center = med, scale = dev))
# Put the unscaled price back as the prediction target.
data2$price <- target
summary(data2)
## normalized-losses wheel-base length width
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1885 1st Qu.:0.2303 1st Qu.:0.3836 1st Qu.:0.3248
## Median :0.2984 Median :0.3032 Median :0.4791 Median :0.4444
## Mean :0.2984 Mean :0.3556 Mean :0.4940 Mean :0.4777
## 3rd Qu.:0.3770 3rd Qu.:0.4606 3rd Qu.:0.6328 3rd Qu.:0.5385
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## height curb-weight bore stroke
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3500 1st Qu.:0.2642 1st Qu.:0.4357 1st Qu.:0.4952
## Median :0.5250 Median :0.3592 Median :0.5500 Median :0.5810
## Mean :0.4972 Mean :0.4141 Mean :0.5648 Mean :0.5652
## 3rd Qu.:0.6417 3rd Qu.:0.5578 3rd Qu.:0.7429 3rd Qu.:0.6381
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## compression-rate horsepower peak-rpm city-mpg
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.1000 1st Qu.:0.1028 1st Qu.:0.2653 1st Qu.:0.1667
## Median :0.1250 Median :0.2196 Median :0.3949 Median :0.3056
## Mean :0.1978 Mean :0.2589 Mean :0.3949 Mean :0.3383
## 3rd Qu.:0.1500 3rd Qu.:0.3178 3rd Qu.:0.5510 3rd Qu.:0.4722
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## highway-mpg price
## Min. :0.0000 Min. : 5118
## 1st Qu.:0.2368 1st Qu.: 7775
## Median :0.3684 Median :10295
## Mean :0.3865 Mean :13207
## 3rd Qu.:0.4737 3rd Qu.:16500
## Max. :1.0000 Max. :45400
library(corrplot)
library(Hmisc)
# Pairwise correlations between all columns. rcorr() needs a matrix and
# returns a list whose elements include the coefficients (`r`) and the
# corresponding p-values (`P`).
corr <- rcorr(as.matrix(data2))
# Correlation coefficient matrix.
corr[["r"]]
## normalized-losses wheel-base length width
## normalized-losses 1.00000000 -0.05666124 0.01942356 0.08680206
## wheel-base -0.05666124 1.00000000 0.87602389 0.81450665
## length 0.01942356 0.87602389 1.00000000 0.85717032
## width 0.08680206 0.81450665 0.85717032 1.00000000
## height -0.37373695 0.59074167 0.49206255 0.30600216
## curb-weight 0.09940425 0.78209724 0.88066479 0.86620110
## bore -0.02979985 0.49320299 0.60894083 0.54487909
## stroke 0.05512732 0.15796369 0.12391279 0.18881359
## compression-rate -0.11471325 0.25031309 0.15973311 0.18986712
## horsepower 0.21730000 0.37124988 0.57973062 0.61500603
## peak-rpm 0.23954380 -0.36023264 -0.28603534 -0.24585165
## city-mpg -0.22501573 -0.47060641 -0.66519239 -0.63353064
## highway-mpg -0.18187718 -0.54330447 -0.69814185 -0.68063521
## price 0.13399873 0.58464182 0.69062838 0.75126534
## height curb-weight bore stroke
## normalized-losses -0.37373695 0.09940425 -0.029799848 0.05512732
## wheel-base 0.59074167 0.78209724 0.493202986 0.15796369
## length 0.49206255 0.88066479 0.608940834 0.12391279
## width 0.30600216 0.86620110 0.544879092 0.18881359
## height 1.00000000 0.30758082 0.180326923 -0.06082202
## curb-weight 0.30758082 1.00000000 0.644040577 0.16741187
## bore 0.18032692 0.64404058 1.000000000 -0.05539001
## stroke -0.06082202 0.16741187 -0.055390011 1.00000000
## compression-rate 0.25973714 0.15643261 0.001249645 0.18785352
## horsepower -0.08694068 0.75799367 0.566837859 0.09781468
## peak-rpm -0.30991346 -0.27934961 -0.267338383 -0.06371957
## city-mpg -0.04979997 -0.74954309 -0.582121055 -0.03407894
## highway-mpg -0.10481184 -0.79488894 -0.591390045 -0.03474142
## price 0.13548631 0.83441453 0.543153766 0.08226710
## compression-rate horsepower peak-rpm city-mpg
## normalized-losses -0.114713246 0.21730000 0.23954380 -0.22501573
## wheel-base 0.250313088 0.37124988 -0.36023264 -0.47060641
## length 0.159733109 0.57973062 -0.28603534 -0.66519239
## width 0.189867118 0.61500603 -0.24585165 -0.63353064
## height 0.259737141 -0.08694068 -0.30991346 -0.04979997
## curb-weight 0.156432613 0.75799367 -0.27934961 -0.74954309
## bore 0.001249645 0.56683786 -0.26733838 -0.58212106
## stroke 0.187853516 0.09781468 -0.06371957 -0.03407894
## compression-rate 1.000000000 -0.21443063 -0.43572083 0.33142484
## horsepower -0.214430629 1.00000000 0.10788157 -0.82213847
## peak-rpm -0.435720829 0.10788157 1.00000000 -0.11535804
## city-mpg 0.331424839 -0.82213847 -0.11535804 1.00000000
## highway-mpg 0.268464848 -0.80458746 -0.05860516 0.97204371
## price 0.071107327 0.80968120 -0.10154203 -0.68657101
## highway-mpg price
## normalized-losses -0.18187718 0.13399873
## wheel-base -0.54330447 0.58464182
## length -0.69814185 0.69062838
## width -0.68063521 0.75126534
## height -0.10481184 0.13548631
## curb-weight -0.79488894 0.83441453
## bore -0.59139004 0.54315377
## stroke -0.03474142 0.08226710
## compression-rate 0.26846485 0.07110733
## horsepower -0.80458746 0.80968120
## peak-rpm -0.05860516 -0.10154203
## city-mpg 0.97204371 -0.68657101
## highway-mpg 1.00000000 -0.70469227
## price -0.70469227 1.00000000
# Replace the NA entries of the p-value matrix with 0 so that corrplot()
# receives a complete `p.mat`.
corr$P <- replace(corr$P, is.na(corr$P), 0)
# Correlogram with circles, variables reordered by hierarchical clustering.
corrplot(
  corr$r,
  p.mat = corr$P,
  method = "circle",
  order = "hclust"
)
\(\textbf{Ejercicio 1:}\) Calcula la correlación entre normalized-losses y wheel-base
usando la definición de correlación entre dos vectores.

\(\textbf{Ejercicio 2:}\) Define el volumen como length*width*height y divídelo
entre 1000 para normalizarlo. Grafica el boxplot del volumen de acuerdo con el
estilo del carro.
# Scatter plot of price against horsepower. The x values come from the
# min-max scaled table (data2), the y values from the unscaled price in `data`.
plot(
  x = data2$horsepower,
  y = data$price,
  main = "Price vs horsepower",
  xlab = "horsepower",
  ylab = "price",
  pch = 16,
  col = "blue",
  cex = 1.5
)
height, en la descripción de la variable
price.
# Fixed seed so the random split is reproducible.
set.seed(1987)

# Draw half of the row indices (rounded down) for training. floor() makes the
# truncation of a fractional size explicit, and seq_len() stays correct even
# for an empty table, unlike 1:nrow().
n_obs <- nrow(data2)
dat.d <- sample(seq_len(n_obs), size = floor(n_obs * 0.5), replace = FALSE)

# Predictors are every column except the target; selecting by name avoids
# depending on price being the 14th column.
feature_cols <- setdiff(colnames(data2), "price")
entrenamiento <- data2[dat.d, feature_cols]   # training half
prueba <- data2[-dat.d, feature_cols]         # remaining rows, for testing

# Matching unscaled prices for each subset.
precio_entrenamiento <- target[dat.d]
precio_prueba <- target[-dat.d]
library(class)
# Empirical choice of the number of neighbours: the floor of the square root
# of the number of training rows.
k1 <- floor(sqrt(nrow(entrenamiento)))
# k-nearest-neighbours using `height` as the only predictor; the training
# prices are passed as the class labels.
knn.k1 <- knn(
  train = as.matrix(entrenamiento[["height"]]),
  test = as.matrix(prueba[["height"]]),
  cl = precio_entrenamiento,
  k = k1
)
height.
# knn() returns a factor whose levels are the training prices. Going through
# as.character() recovers the predicted prices as numbers; as.vector() alone
# would leave them as character strings and rely on implicit coercion inside
# plot() and rcorr().
x1 <- as.numeric(as.character(knn.k1))
# Predicted versus actual prices on the test set.
plot(x1, precio_prueba)
# NOTE(review): despite the name, this is the correlation between predictions
# and actual prices as reported by rcorr(), not a coefficient of determination.
r2score1 <- rcorr(x1, precio_prueba)
r2score1
## x y
## x 1.00 0.16
## y 0.16 1.00
##
## n= 101
##
##
## P
## x y
## x 0.1035
## y 0.1035
# Candidate numbers of neighbours.
k <- c(1, 3, 5, 7, 9, 11, 13, 15, 20)

# One row per k, one column per predictor. The result is preallocated rather
# than grown cell by cell, and its size comes from the data instead of a
# hard-coded column count.
n_features <- ncol(entrenamiento)
r2score <- matrix(NA_real_, nrow = length(k), ncol = n_features)

# For each single predictor and each k, fit knn on that predictor alone and
# store the correlation between predicted and actual test prices.
# The loop order (predictor outer, k inner) is kept as is: knn() breaks ties at
# random, so the order of the calls affects the random-number stream.
for (j in seq_len(n_features)) {
  for (i in seq_along(k)) {
    predicted <- knn(
      train = as.matrix(entrenamiento[, j]),
      test = as.matrix(prueba[, j]),
      cl = precio_entrenamiento,
      k = k[i]
    )
    # Factor of price labels -> numeric predictions.
    predicted <- as.numeric(as.character(predicted))
    r2score[i, j] <- rcorr(predicted, precio_prueba)$r[1, 2]
  }
}

r2score <- as.data.frame(r2score)
colnames(r2score) <- colnames(entrenamiento)
# Wide table for plotting: one row per k, one column per predictor.
re2score.graficar <- data.frame(k = k, r2score)
library(reshape2)
# ggplot() and aes() are used below, so ggplot2 is attached explicitly instead
# of relying on another package attaching it as a side effect.
library(ggplot2)
library(plotly)

# Long format: one row per (k, predictor) pair, with the score in `value`.
# `id.vars` is spelled out in full rather than partially matched as `id`.
graficar <- melt(re2score.graficar, id.vars = "k")

# One line per predictor: score as a function of the number of neighbours.
plot.k <- ggplot(graficar, aes(x = k, y = value, color = variable)) +
  geom_line()
plot.k

# Interactive version of the same plot.
ggplotly(plot.k)