# Load the first dataset to be analysed (car attributes and price).
# read.csv() accepts a URL string directly, so no explicit url() connection
# is needed.
data <- read.csv(
  "https://raw.githubusercontent.com/geovannychoez/prueba/master/MTCars.csv",
  header = TRUE
)
# View() opens a GUI data viewer; only call it in interactive sessions so
# that non-interactive runs (Rscript, knitting) do not depend on a viewer.
if (interactive()) {
  View(data)
}
names(data)
## [1] "car_ID" "symboling" "CarName" "fueltype"
## [5] "aspiration" "doornumber" "carbody" "drivewheel"
## [9] "enginelocation" "wheelbase" "carlength" "carwidth"
## [13] "carheight" "curbweight" "enginetype" "cylindernumber"
## [17] "enginesize" "fuelsystem" "boreratio" "stroke"
## [21] "compressionratio" "horsepower" "peakrpm" "citympg"
## [25] "highwaympg" "price"
# Show the type and first values of every column.
str(data)
## 'data.frame': 205 obs. of 26 variables:
## $ car_ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ symboling : int 3 3 1 2 2 2 1 1 1 0 ...
## $ CarName : chr "alfa-romero giulia" "alfa-romero stelvio" "alfa-romero Quadrifoglio" "audi 100 ls" ...
## $ fueltype : chr "gas" "gas" "gas" "gas" ...
## $ aspiration : chr "std" "std" "std" "std" ...
## $ doornumber : chr "two" "two" "two" "four" ...
## $ carbody : chr "convertible" "convertible" "hatchback" "sedan" ...
## $ drivewheel : chr "rwd" "rwd" "rwd" "fwd" ...
## $ enginelocation : chr "front" "front" "front" "front" ...
## $ wheelbase : num 88.6 88.6 94.5 99.8 99.4 ...
## $ carlength : num 169 169 171 177 177 ...
## $ carwidth : num 64.1 64.1 65.5 66.2 66.4 66.3 71.4 71.4 71.4 67.9 ...
## $ carheight : num 48.8 48.8 52.4 54.3 54.3 53.1 55.7 55.7 55.9 52 ...
## $ curbweight : int 2548 2548 2823 2337 2824 2507 2844 2954 3086 3053 ...
## $ enginetype : chr "dohc" "dohc" "ohcv" "ohc" ...
## $ cylindernumber : chr "four" "four" "six" "four" ...
## $ enginesize : int 130 130 152 109 136 136 136 136 131 131 ...
## $ fuelsystem : chr "mpfi" "mpfi" "mpfi" "mpfi" ...
## $ boreratio : num 3.47 3.47 2.68 3.19 3.19 3.19 3.19 3.19 3.13 3.13 ...
## $ stroke : num 2.68 2.68 3.47 3.4 3.4 3.4 3.4 3.4 3.4 3.4 ...
## $ compressionratio: num 9 9 9 10 8 8.5 8.5 8.5 8.3 7 ...
## $ horsepower : int 111 111 154 102 115 110 110 110 140 160 ...
## $ peakrpm : int 5000 5000 5000 5500 5500 5500 5500 5500 5500 5500 ...
## $ citympg : int 21 21 19 24 18 19 19 19 17 16 ...
## $ highwaympg : int 27 27 26 30 22 25 25 25 20 22 ...
## $ price : num 13495 16500 16500 13950 17450 ...
#De las 26 variables 10 no son numéricas, por tanto no se las considera para la matriz de correlacion
# Load the second dataset (used-car listings with price).
# read.csv() accepts a URL string directly, so no explicit url() connection
# is needed.
data2 <- read.csv(
  "https://raw.githubusercontent.com/geovannychoez/prueba/master/UsedCars.csv",
  header = TRUE
)
# View() opens a GUI data viewer; only call it in interactive sessions so
# that non-interactive runs (Rscript, knitting) do not depend on a viewer.
if (interactive()) {
  View(data2)
}
names(data2)
## [1] "Make" "Model" "Price"
## [4] "Year" "Kilometer" "Fuel.Type"
## [7] "Transmission" "Location" "Color"
## [10] "Owner" "Seller.Type" "Engine"
## [13] "Max.Power" "Max.Torque" "Drivetrain"
## [16] "Length" "Width" "Height"
## [19] "Seating.Capacity" "Fuel.Tank.Capacity"
# Show the type and first values of every column.
str(data2)
## 'data.frame': 1874 obs. of 20 variables:
## $ Make : chr "Honda" "Maruti Suzuki" "Hyundai" "Toyota" ...
## $ Model : chr "Amaze 1.2 VX i-VTEC" "Swift DZire VDI" "i10 Magna 1.2 Kappa2" "Glanza G" ...
## $ Price : int 505000 450000 220000 799000 1950000 675000 2650000 1390000 575000 591000 ...
## $ Year : int 2017 2014 2011 2019 2018 2017 2017 2017 2015 2017 ...
## $ Kilometer : int 87150 75000 67000 37500 69000 73315 75000 56000 85000 20281 ...
## $ Fuel.Type : chr "Petrol" "Diesel" "Petrol" "Petrol" ...
## $ Transmission : chr "Manual" "Manual" "Manual" "Manual" ...
## $ Location : chr "Pune" "Ludhiana" "Lucknow" "Mangalore" ...
## $ Color : chr "Grey" "White" "Maroon" "Red" ...
## $ Owner : chr "First" "Second" "First" "First" ...
## $ Seller.Type : chr "Corporate" "Individual" "Individual" "Individual" ...
## $ Engine : chr "1198 cc" "1248 cc" "1197 cc" "1197 cc" ...
## $ Max.Power : chr "87 bhp @ 6000 rpm" "74 bhp @ 4000 rpm" "79 bhp @ 6000 rpm" "82 bhp @ 6000 rpm" ...
## $ Max.Torque : chr "109 Nm @ 4500 rpm" "190 Nm @ 2000 rpm" "112.7619 Nm @ 4000 rpm" "113 Nm @ 4200 rpm" ...
## $ Drivetrain : chr "FWD" "FWD" "FWD" "FWD" ...
## $ Length : int 3990 3995 3585 3995 4735 4490 4439 4670 4331 3985 ...
## $ Width : int 1680 1695 1595 1745 1830 1730 1821 1814 1822 1734 ...
## $ Height : int 1505 1555 1550 1510 1795 1485 1612 1476 1671 1505 ...
## $ Seating.Capacity : int 5 5 5 5 7 5 5 5 5 5 ...
## $ Fuel.Tank.Capacity: num 35 42 35 37 55 43 51 50 50 45 ...
#De las 20 variables 12 no son numéricas, por tanto no se las considera para la matriz de correlacion
#Se importan librerias para graficas
knitr::opts_chunk$set(echo = TRUE)
library(highcharter)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
if (!requireNamespace("corrplot", quietly = TRUE)) {
install.packages("corrplot")
}
library(corrplot)
## corrplot 0.92 loaded
# Keep only the numeric columns of `data`: cor() needs numeric input, and the
# correlation matrix is the starting point for the regression model.
# Columns are selected by type instead of by hard-coded position, so the
# selection keeps working if the column order of the CSV changes. For the
# file shown above this yields the same 16 columns as
# c(1:2, 10:14, 17, 19:26).
es_numerica_data <- vapply(data, is.numeric, logical(1))
variables_numericas_data <- data[es_numerica_data]
# Only open the GUI viewer in interactive sessions.
if (interactive()) {
  View(variables_numericas_data)
}
names(variables_numericas_data)
## [1] "car_ID" "symboling" "wheelbase" "carlength"
## [5] "carwidth" "carheight" "curbweight" "enginesize"
## [9] "boreratio" "stroke" "compressionratio" "horsepower"
## [13] "peakrpm" "citympg" "highwaympg" "price"
# Pairwise correlation matrix of all numeric columns (cor() default method).
matriz_corr_data <- cor(variables_numericas_data)
matriz_corr_data
## car_ID symboling wheelbase carlength carwidth
## car_ID 1.00000000 -0.151621137 0.1297288 0.1706364 0.05238661
## symboling -0.15162114 1.000000000 -0.5319537 -0.3576115 -0.23291906
## wheelbase 0.12972878 -0.531953682 1.0000000 0.8745875 0.79514364
## carlength 0.17063639 -0.357611523 0.8745875 1.0000000 0.84111827
## carwidth 0.05238661 -0.232919061 0.7951436 0.8411183 1.00000000
## carheight 0.25596004 -0.541038200 0.5894348 0.4910295 0.27921032
## curbweight 0.07196156 -0.227690588 0.7763863 0.8777285 0.86703246
## enginesize -0.03392984 -0.105789709 0.5693287 0.6833599 0.73543340
## boreratio 0.26006368 -0.130051360 0.4887499 0.6064544 0.55914991
## stroke -0.16082362 -0.008735141 0.1609590 0.1295326 0.18294169
## compressionratio 0.15027591 -0.178515084 0.2497858 0.1584137 0.18112863
## horsepower -0.01500557 0.070872724 0.3532945 0.5526230 0.64073208
## peakrpm -0.20378920 0.273606245 -0.3604687 -0.2872422 -0.22001230
## citympg 0.01594004 -0.035822628 -0.4704136 -0.6709087 -0.64270434
## highwaympg 0.01125532 0.034606001 -0.5440819 -0.7046616 -0.67721792
## price -0.10909334 -0.079978225 0.5778156 0.6829200 0.75932530
## carheight curbweight enginesize boreratio stroke
## car_ID 0.25596004 0.07196156 -0.03392984 0.260063680 -0.160823619
## symboling -0.54103820 -0.22769059 -0.10578971 -0.130051360 -0.008735141
## wheelbase 0.58943476 0.77638633 0.56932868 0.488749875 0.160959047
## carlength 0.49102946 0.87772846 0.68335987 0.606454358 0.129532611
## carwidth 0.27921032 0.86703246 0.73543340 0.559149909 0.182941693
## carheight 1.00000000 0.29557173 0.06714874 0.171070922 -0.055306674
## curbweight 0.29557173 1.00000000 0.85059407 0.648479749 0.168790035
## enginesize 0.06714874 0.85059407 1.00000000 0.583774327 0.203128588
## boreratio 0.17107092 0.64847975 0.58377433 1.000000000 -0.055908983
## stroke -0.05530667 0.16879004 0.20312859 -0.055908983 1.000000000
## compressionratio 0.26121423 0.15136174 0.02897136 0.005197339 0.186110110
## horsepower -0.10880206 0.75073925 0.80976865 0.573676823 0.080939536
## peakrpm -0.32041072 -0.26624318 -0.24465983 -0.254975528 -0.067963753
## citympg -0.04863963 -0.75741378 -0.65365792 -0.584531716 -0.042144754
## highwaympg -0.10735763 -0.79746479 -0.67746991 -0.587011784 -0.043930930
## price 0.11933623 0.83530488 0.87414480 0.553173237 0.079443084
## compressionratio horsepower peakrpm citympg
## car_ID 0.150275906 -0.01500557 -0.20378920 0.01594004
## symboling -0.178515084 0.07087272 0.27360625 -0.03582263
## wheelbase 0.249785845 0.35329448 -0.36046875 -0.47041361
## carlength 0.158413706 0.55262297 -0.28724220 -0.67090866
## carwidth 0.181128627 0.64073208 -0.22001230 -0.64270434
## carheight 0.261214226 -0.10880206 -0.32041072 -0.04863963
## curbweight 0.151361740 0.75073925 -0.26624318 -0.75741378
## enginesize 0.028971360 0.80976865 -0.24465983 -0.65365792
## boreratio 0.005197339 0.57367682 -0.25497553 -0.58453172
## stroke 0.186110110 0.08093954 -0.06796375 -0.04214475
## compressionratio 1.000000000 -0.20432623 -0.43574051 0.32470142
## horsepower -0.204326226 1.00000000 0.13107251 -0.80145618
## peakrpm -0.435740514 0.13107251 1.00000000 -0.11354438
## citympg 0.324701425 -0.80145618 -0.11354438 1.00000000
## highwaympg 0.265201389 -0.77054389 -0.05427481 0.97133704
## price 0.067983506 0.80813882 -0.08526715 -0.68575134
## highwaympg price
## car_ID 0.01125532 -0.10909334
## symboling 0.03460600 -0.07997822
## wheelbase -0.54408192 0.57781560
## carlength -0.70466160 0.68292002
## carwidth -0.67721792 0.75932530
## carheight -0.10735763 0.11933623
## curbweight -0.79746479 0.83530488
## enginesize -0.67746991 0.87414480
## boreratio -0.58701178 0.55317324
## stroke -0.04393093 0.07944308
## compressionratio 0.26520139 0.06798351
## horsepower -0.77054389 0.80813882
## peakrpm -0.05427481 -0.08526715
## citympg 0.97133704 -0.68575134
## highwaympg 1.00000000 -0.69759909
## price -0.69759909 1.00000000
# Keep only the numeric columns of `data2`: cor() needs numeric input, and the
# correlation matrix is the starting point for the regression model.
# Columns are selected by type instead of by hard-coded position, so the
# selection keeps working if the column order of the CSV changes. For the
# file shown above this yields the same 8 columns as c(3:5, 16:20).
es_numerica_data2 <- vapply(data2, is.numeric, logical(1))
variables_numericas_data2 <- data2[es_numerica_data2]
# Only open the GUI viewer in interactive sessions.
if (interactive()) {
  View(variables_numericas_data2)
}
names(variables_numericas_data2)
## [1] "Price" "Year" "Kilometer"
## [4] "Length" "Width" "Height"
## [7] "Seating.Capacity" "Fuel.Tank.Capacity"
# Pairwise correlation matrix of all numeric columns (cor() default method).
matriz_corr_data2 <- cor(variables_numericas_data2)
matriz_corr_data2
## Price Year Kilometer Length
## Price 1.00000000 0.3093808616 -0.147276151 0.56887490
## Year 0.30938086 1.0000000000 -0.291739924 0.08517813
## Kilometer -0.14727615 -0.2917399243 1.000000000 0.03781718
## Length 0.56887490 0.0851781331 0.037817177 1.00000000
## Width 0.57709978 0.1822201159 0.008479363 0.79722805
## Height 0.09296771 0.1268485819 0.085727621 0.19470421
## Seating.Capacity -0.02487942 -0.0001130477 0.111102936 0.29852678
## Fuel.Tank.Capacity 0.58610949 0.0448594975 0.052446915 0.80981209
## Width Height Seating.Capacity Fuel.Tank.Capacity
## Price 0.577099782 0.09296771 -0.0248794220 0.58610949
## Year 0.182220116 0.12684858 -0.0001130477 0.04485950
## Kilometer 0.008479363 0.08572762 0.1111029359 0.05244692
## Length 0.797228050 0.19470421 0.2985267764 0.80981209
## Width 1.000000000 0.32692755 0.2290563388 0.79131609
## Height 0.326927546 1.00000000 0.6953727610 0.40871653
## Seating.Capacity 0.229056339 0.69537276 1.0000000000 0.31392753
## Fuel.Tank.Capacity 0.791316085 0.40871653 0.3139275291 1.00000000
# Plot the upper triangle of the correlation matrix of `data` as coloured
# cells with the coefficient printed in each cell.
corrplot(
  matriz_corr_data,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,          # rotation of the variable labels
  tl.cex = 0.8,         # text size of the variable labels
  title = "Matriz de Correlación: Data",
  mar = c(0, 0, 2, 0),  # reserve a top margin so the title is not clipped
  addCoef.col = "black",
  number.cex = 0.7      # text size of the coefficients
)
#La variable “price” se relaciona positivamente de manera significativa
#(>0.5) con 7 variables y negativamente de manera significativa
#(<-0.5) con 2 variables:
#wheelbase, carlength, carwidth, curbweight, enginesize, boreratio, horsepower, citympg, highwaympg
# Plot the upper triangle of the correlation matrix of `data2` as coloured
# cells with the coefficient printed in each cell.
corrplot(
  matriz_corr_data2,
  method = "color",
  type = "upper",
  tl.col = "black",
  tl.srt = 45,          # rotation of the variable labels
  tl.cex = 0.8,         # text size of the variable labels
  title = "Matriz de Correlación: Data2",
  mar = c(0, 0, 2, 0),  # reserve a top margin so the title is not clipped
  addCoef.col = "black",
  number.cex = 0.7      # text size of the coefficients
)
#La variable “Price” se relaciona positivamente de manera significativa
#(>0.5) con 3 variables:
#Length, Width, Fuel.Tank.Capacity
# Shorter aliases for the numeric-only data frames, to keep the model
# formulas below easier to write.
n_data <- variables_numericas_data
n_data2 <- variables_numericas_data2
# Only open the GUI viewer in interactive sessions so that non-interactive
# runs (Rscript, knitting) do not depend on a viewer.
if (interactive()) {
  View(n_data)
}
# Multiple linear regression of price on the nine variables whose correlation
# with price exceeds 0.5 in absolute value.
# Columns are given as bare names resolved through `data = n_data` instead of
# `n_data$col` terms, for two reasons:
#   * `n_data$curbweigh` only worked through partial matching of `$`; the
#     column is actually called `curbweight`.
#   * a model fitted on `df$col` terms cannot be used with
#     predict(newdata = ...), because the terms are tied to the original
#     vectors rather than to column names.
# The estimates are unchanged; only the coefficient labels lose the
# `n_data$` prefix when the model is printed.
modelo_lineal_Data <- lm(
  price ~ wheelbase + carlength + carwidth + curbweight + enginesize +
    boreratio + horsepower + citympg + highwaympg,
  data = n_data
)
modelo_lineal_Data
##
## Call:
## lm(formula = n_data$price ~ n_data$wheelbase + n_data$carlength +
## n_data$carwidth + n_data$curbweigh + n_data$enginesize +
## n_data$boreratio + n_data$horsepower + n_data$citympg + n_data$highwaympg)
##
## Coefficients:
## (Intercept) n_data$wheelbase n_data$carlength n_data$carwidth
## -43223.926 109.895 -57.853 532.467
## n_data$curbweigh n_data$enginesize n_data$boreratio n_data$horsepower
## 2.918 83.615 -1140.464 53.754
## n_data$citympg n_data$highwaympg
## -119.815 122.855
#Se obtiene: Y=B0+B1(n_data$wheelbase)+B2(n_data$carlength)+B3(n_data$carwidth)+B4(n_data$curbweight)+B5(n_data$enginesize)+B6(n_data$boreratio)+B7(n_data$horsepower)+B8(n_data$citympg)+B9(n_data$highwaympg)
#B0= -43223.926 | B1= 109.895 | B2= -57.853 | B3= 532.467 | B4= 2.918 | B5= 83.615 | B6= -1140.464 | B7= 53.754 | B8= -119.815 | B9= 122.855
# Multiple linear regression of Price on the three variables whose
# correlation with Price exceeds 0.5.
# Columns are given as bare names resolved through `data = n_data2` instead
# of `n_data2$col` terms, so the fitted model can later be used with
# predict(newdata = ...). The estimates are unchanged; only the coefficient
# labels lose the `n_data2$` prefix when the model is printed.
modelo_lineal_Data2 <- lm(
  Price ~ Length + Width + Fuel.Tank.Capacity,
  data = n_data2
)
modelo_lineal_Data2
##
## Call:
## lm(formula = n_data2$Price ~ n_data2$Length + n_data2$Width +
## n_data2$Fuel.Tank.Capacity)
##
## Coefficients:
## (Intercept) n_data2$Length
## -1.210e+07 9.343e+02
## n_data2$Width n_data2$Fuel.Tank.Capacity
## 4.300e+03 4.252e+04
#Se obtiene: Y=B0+B1(n_data2$Length)+B2(n_data2$Width)+B3(n_data2$Fuel.Tank.Capacity)
#B0= -1.210e+07 | B1= 9.343e+02 | B2= 4.300e+03 | B3= 4.252e+04
# Print the full summary of the nine-predictor model for `data`: residual
# quantiles, coefficient table with t-tests, R-squared and the overall F-test.
summary(modelo_lineal_Data)
##
## Call:
## lm(formula = n_data$price ~ n_data$wheelbase + n_data$carlength +
## n_data$carwidth + n_data$curbweigh + n_data$enginesize +
## n_data$boreratio + n_data$horsepower + n_data$citympg + n_data$highwaympg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8297 -1542 31 1307 14444
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43223.926 13760.207 -3.141 0.001944 **
## n_data$wheelbase 109.895 101.743 1.080 0.281418
## n_data$carlength -57.853 57.933 -0.999 0.319217
## n_data$carwidth 532.467 255.561 2.084 0.038508 *
## n_data$curbweigh 2.918 1.651 1.767 0.078749 .
## n_data$enginesize 83.615 13.434 6.224 2.91e-09 ***
## n_data$boreratio -1140.464 1212.508 -0.941 0.348083
## n_data$horsepower 53.754 15.447 3.480 0.000619 ***
## n_data$citympg -119.815 185.536 -0.646 0.519183
## n_data$highwaympg 122.855 170.820 0.719 0.472872
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3444 on 195 degrees of freedom
## Multiple R-squared: 0.8223, Adjusted R-squared: 0.8141
## F-statistic: 100.3 on 9 and 195 DF, p-value: < 2.2e-16
#Con p-value: < 2.2e-16 menor a 0.05, RECHAZO H0: (b1=b2…=bp=0)
#Si Pr(>|t|) es menor al nivel de significancia (aquí se compara contra 0.025), RECHAZO H0: bi=0
#0.001944 es menor 0.025 (el intercepto no afecta a los estimadores del modelo)
#0.281418 es mayor 0.025 NO RECHAZO H0
#0.319217 es mayor 0.025 NO RECHAZO H0
#0.038508 es mayor 0.025 NO RECHAZO H0
#0.078749 es mayor 0.025 NO RECHAZO H0
#2.91e-09 es menor 0.025 RECHAZO H0
#0.348083 es mayor 0.025 NO RECHAZO H0
#0.000619 es menor 0.025 RECHAZO H0
#0.519183 es mayor 0.025 NO RECHAZO H0
#0.472872 es mayor 0.025 NO RECHAZO H0
#LOS DATOS SE AJUSTAN AL MODELO (Adjusted R-squared: 0.8141), POR TANTO EL MODELO ES LO SUFICIENTEMENTE FIABLE PARA LAS PREVISIONES FUTURAS
#El modelo de muestra de regresión lineal múltiple indica que las variables seleccionadas explican a “Y” y sí logran predecirla
# Second attempt: refit the price model keeping only enginesize and
# horsepower as predictors.
# Columns are given as bare names resolved through `data = n_data` instead of
# `n_data$col` terms, so the fitted model can later be used with
# predict(newdata = ...). The estimates are unchanged; only the coefficient
# labels lose the `n_data$` prefix when the model is printed.
modelo_lineal_Data_intento2 <- lm(
  price ~ enginesize + horsepower,
  data = n_data
)
modelo_lineal_Data_intento2
##
## Call:
## lm(formula = n_data$price ~ n_data$enginesize + n_data$horsepower)
##
## Coefficients:
## (Intercept) n_data$enginesize n_data$horsepower
## -8389.73 122.45 58.85
# Print the full summary of the reduced two-predictor model for `data`:
# residual quantiles, coefficient table with t-tests, R-squared and the
# overall F-test.
summary(modelo_lineal_Data_intento2)
##
## Call:
## lm(formula = n_data$price ~ n_data$enginesize + n_data$horsepower)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10946.0 -1946.7 -218.8 1775.5 13403.1
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -8389.73 822.53 -10.200 < 2e-16 ***
## n_data$enginesize 122.45 10.46 11.709 < 2e-16 ***
## n_data$horsepower 58.85 11.01 5.344 2.45e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3650 on 202 degrees of freedom
## Multiple R-squared: 0.7933, Adjusted R-squared: 0.7913
## F-statistic: 387.7 on 2 and 202 DF, p-value: < 2.2e-16
#En este segundo intento nos quedamos con las variables “enginesize” y “horsepower” que explican y predicen significativamente a “price”
#Con p-value: < 2.2e-16 menor a 0.05, RECHAZO H0: (b1=b2…=bp=0)
#Si Pr(>|t|) es menor al nivel de significancia (aquí se compara contra 0.025), RECHAZO H0: bi=0
#2e-16 es menor 0.025 (el intercepto no afecta a los estimadores del modelo)
#< 2e-16 es menor 0.025 RECHAZO H0
#2.45e-07 es menor 0.025 RECHAZO H0
#LOS DATOS SE AJUSTAN AL MODELO (Adjusted R-squared: 0.7913), POR TANTO EL MODELO ES FIABLE PARA LAS PREVISIONES FUTURAS, MAS NO TANTO COMO EN EL PRIMER INTENTO
#El modelo de muestra de regresión lineal múltiple indica que las variables seleccionadas explican a “Y” y sí logran predecirla, aunque con menos exactitud que el primer modelo
# Print the full summary of the three-predictor model for `data2`: residual
# quantiles, coefficient table with t-tests, R-squared and the overall F-test.
summary(modelo_lineal_Data2)
##
## Call:
## lm(formula = n_data2$Price ~ n_data2$Length + n_data2$Width +
## n_data2$Fuel.Tank.Capacity)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4226517 -778163 -216094 412501 31125964
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.210e+07 8.158e+05 -14.838 < 2e-16 ***
## n_data2$Length 9.343e+02 1.909e+02 4.894 1.07e-06 ***
## n_data2$Width 4.300e+03 6.084e+02 7.068 2.22e-12 ***
## n_data2$Fuel.Tank.Capacity 4.252e+04 5.421e+03 7.844 7.30e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1903000 on 1870 degrees of freedom
## Multiple R-squared: 0.3857, Adjusted R-squared: 0.3847
## F-statistic: 391.4 on 3 and 1870 DF, p-value: < 2.2e-16
#Con p-value: < 2.2e-16 menor a 0.05, RECHAZO H0: (b1=b2…=bp=0)
#Si Pr(>|t|) es menor al nivel de significancia (aquí se compara contra 0.025), RECHAZO H0: bi=0
#2e-16 es menor 0.025 (el intercepto no afecta a los estimadores del modelo)
#1.07e-06 es menor 0.025 RECHAZO H0
#2.22e-12 es menor 0.025 RECHAZO H0
#7.30e-15 es menor 0.025 RECHAZO H0
#LOS DATOS SE AJUSTAN POCO AL MODELO (Adjusted R-squared: 0.3847), POR TANTO EL MODELO NO ES LO SUFICIENTEMENTE FIABLE PARA LAS PREVISIONES FUTURAS
#El modelo de muestra de regresión lineal múltiple indica que las variables seleccionadas explican a “Y” pero no logran predecirla