## Case Study 1: Analysis of the MTCars dataset
# Download the MTCars CSV from GitHub into a data frame; the first line of the
# file is read as the column names.
data1 <- read.csv(
  file = "https://raw.githubusercontent.com/geovannychoez/prueba/master/MTCars.csv",
  header = TRUE
)
# Preview the first six rows to check how the columns were parsed.
# The `##` lines below are the console output captured from a previous run.
head(x = data1, n = 6L)
## car_ID symboling CarName fueltype aspiration doornumber
## 1 1 3 alfa-romero giulia gas std two
## 2 2 3 alfa-romero stelvio gas std two
## 3 3 1 alfa-romero Quadrifoglio gas std two
## 4 4 2 audi 100 ls gas std four
## 5 5 2 audi 100ls gas std four
## 6 6 2 audi fox gas std two
## carbody drivewheel enginelocation wheelbase carlength carwidth carheight
## 1 convertible rwd front 88.6 168.8 64.1 48.8
## 2 convertible rwd front 88.6 168.8 64.1 48.8
## 3 hatchback rwd front 94.5 171.2 65.5 52.4
## 4 sedan fwd front 99.8 176.6 66.2 54.3
## 5 sedan 4wd front 99.4 176.6 66.4 54.3
## 6 sedan fwd front 99.8 177.3 66.3 53.1
## curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke
## 1 2548 dohc four 130 mpfi 3.47 2.68
## 2 2548 dohc four 130 mpfi 3.47 2.68
## 3 2823 ohcv six 152 mpfi 2.68 3.47
## 4 2337 ohc four 109 mpfi 3.19 3.40
## 5 2824 ohc five 136 mpfi 3.19 3.40
## 6 2507 ohc five 136 mpfi 3.19 3.40
## compressionratio horsepower peakrpm citympg highwaympg price
## 1 9.0 111 5000 21 27 13495
## 2 9.0 111 5000 21 27 16500
## 3 9.0 154 5000 19 26 16500
## 4 10.0 102 5500 24 30 13950
## 5 8.0 115 5500 18 22 17450
## 6 8.5 110 5500 19 25 15250
## Correlation analysis of the numeric columns of data1
library(corrplot)
## corrplot 0.94 loaded
# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() returns a list for a zero-column input), and drop = FALSE keeps a
# data frame even when a single column is selected.
data1_num <- data1[, vapply(data1, is.numeric, logical(1)), drop = FALSE]
# Correlation matrix of all numeric columns.
cor_matrix1 <- cor(data1_num)
# Plot the full correlation matrix.
corrplot(cor_matrix1, method = "circle")
# Select the variables whose absolute correlation with at least one OTHER
# variable is greater than 0.5.
important_vars <- local({
  strong <- abs(cor_matrix1) > 0.5
  # Every variable has correlation 1 with itself; without clearing the
  # diagonal each column would pass the test and the filter would do nothing.
  diag(strong) <- FALSE
  which(apply(strong, 2, any))
})
# Fail with a clear message instead of passing an empty data frame to cor().
if (length(important_vars) == 0L) {
  stop("No pair of numeric variables has |correlation| > 0.5.", call. = FALSE)
}
# Restrict the data to the selected variables.
data_important_corr <- data1_num[, important_vars, drop = FALSE]
# Correlation matrix of the selected variables only.
cor_matrix_important <- cor(data_important_corr)
# Plot the reduced correlation matrix.
corrplot(cor_matrix_important, method = "circle")
# Initial model
# Fit a multiple linear regression of price on horsepower, engine size and
# curb weight, using the full data1 data frame.
model1 <- lm(
  formula = price ~ horsepower + enginesize + curbweight,
  data = data1
)
# Print the coefficient table and fit statistics of the initial model.
# The `##` lines below are the console output captured from a previous run.
summary(object = model1)
##
## Call:
## lm(formula = price ~ horsepower + enginesize + curbweight, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9003 -1701 -24 1340 13760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.346e+04 1.333e+03 -10.100 < 2e-16 ***
## horsepower 4.875e+01 1.070e+01 4.557 8.99e-06 ***
## enginesize 8.488e+01 1.276e+01 6.651 2.69e-10 ***
## curbweight 4.263e+00 9.065e-01 4.702 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3473 on 201 degrees of freedom
## Multiple R-squared: 0.8138, Adjusted R-squared: 0.811
## F-statistic: 292.9 on 3 and 201 DF, p-value: < 2.2e-16
# Final model
# Backward stepwise selection starting from model1. step() compares candidate
# models by AIC (not by coefficient p-values). No scope is given, so the
# search only drops terms; direction and trace are spelled out with the values
# step() uses by default in that situation.
model_final1 <- step(object = model1, direction = "backward", trace = 1)
# In the recorded trace below, dropping any predictor increases the AIC, so
# the selected model keeps all three predictors.
## Start: AIC=3346.56
## price ~ horsepower + enginesize + curbweight
##
## Df Sum of Sq RSS AIC
## <none> 2423994702 3346.6
## - horsepower 1 250452350 2674447051 3364.7
## - curbweight 1 266633734 2690628436 3366.0
## - enginesize 1 533516727 2957511429 3385.3
# Print the coefficient table and fit statistics of the final model.
summary(object = model_final1)
##
## Call:
## lm(formula = price ~ horsepower + enginesize + curbweight, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9003 -1701 -24 1340 13760
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.346e+04 1.333e+03 -10.100 < 2e-16 ***
## horsepower 4.875e+01 1.070e+01 4.557 8.99e-06 ***
## enginesize 8.488e+01 1.276e+01 6.651 2.69e-10 ***
## curbweight 4.263e+00 9.065e-01 4.702 4.78e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3473 on 201 degrees of freedom
## Multiple R-squared: 0.8138, Adjusted R-squared: 0.811
## F-statistic: 292.9 on 3 and 201 DF, p-value: < 2.2e-16
## Case Study 2: Analysis of the UsedCars dataset
# Download the UsedCars CSV from GitHub into a data frame; the first line of
# the file is read as the column names.
data2 <- read.csv(
  file = "https://raw.githubusercontent.com/geovannychoez/prueba/master/UsedCars.csv",
  header = TRUE
)
# Preview the first six rows to check how the columns were parsed.
# The `##` lines below are the console output captured from a previous run.
head(x = data2, n = 6L)
## Make Model Price Year Kilometer
## 1 Honda Amaze 1.2 VX i-VTEC 505000 2017 87150
## 2 Maruti Suzuki Swift DZire VDI 450000 2014 75000
## 3 Hyundai i10 Magna 1.2 Kappa2 220000 2011 67000
## 4 Toyota Glanza G 799000 2019 37500
## 5 Toyota Innova 2.4 VX 7 STR [2016-2020] 1950000 2018 69000
## 6 Maruti Suzuki Ciaz ZXi 675000 2017 73315
## Fuel.Type Transmission Location Color Owner Seller.Type Engine
## 1 Petrol Manual Pune Grey First Corporate 1198 cc
## 2 Diesel Manual Ludhiana White Second Individual 1248 cc
## 3 Petrol Manual Lucknow Maroon First Individual 1197 cc
## 4 Petrol Manual Mangalore Red First Individual 1197 cc
## 5 Diesel Manual Mumbai Grey First Individual 2393 cc
## 6 Petrol Manual Pune Grey First Individual 1373 cc
## Max.Power Max.Torque Drivetrain Length Width Height
## 1 87 bhp @ 6000 rpm 109 Nm @ 4500 rpm FWD 3990 1680 1505
## 2 74 bhp @ 4000 rpm 190 Nm @ 2000 rpm FWD 3995 1695 1555
## 3 79 bhp @ 6000 rpm 112.7619 Nm @ 4000 rpm FWD 3585 1595 1550
## 4 82 bhp @ 6000 rpm 113 Nm @ 4200 rpm FWD 3995 1745 1510
## 5 148 bhp @ 3400 rpm 343 Nm @ 1400 rpm RWD 4735 1830 1795
## 6 91 bhp @ 6000 rpm 130 Nm @ 4000 rpm FWD 4490 1730 1485
## Seating.Capacity Fuel.Tank.Capacity
## 1 5 35
## 2 5 42
## 3 5 35
## 4 5 37
## 5 7 55
## 6 5 43
## Correlation analysis of the numeric columns of data2
library(corrplot)
# Keep only the numeric columns. vapply() always returns a logical vector
# (sapply() returns a list for a zero-column input), and drop = FALSE keeps a
# data frame even when a single column is selected.
data2_num <- data2[, vapply(data2, is.numeric, logical(1)), drop = FALSE]
# Correlation matrix of all numeric columns.
cor_matrix2 <- cor(data2_num)
# Plot the full correlation matrix.
corrplot(cor_matrix2, method = "circle")
# Select the variables whose absolute correlation with at least one OTHER
# variable is greater than 0.5.
important_vars2 <- local({
  strong <- abs(cor_matrix2) > 0.5
  # Every variable has correlation 1 with itself; without clearing the
  # diagonal each column would pass the test and the filter would do nothing.
  diag(strong) <- FALSE
  which(apply(strong, 2, any))
})
# Fail with a clear message instead of passing an empty data frame to cor().
if (length(important_vars2) == 0L) {
  stop("No pair of numeric variables has |correlation| > 0.5.", call. = FALSE)
}
# Restrict the data to the selected variables.
data_important_corr2 <- data2_num[, important_vars2, drop = FALSE]
# Correlation matrix of the selected variables only.
cor_matrix_important2 <- cor(data_important_corr2)
# Plot the reduced correlation matrix.
corrplot(cor_matrix_important2, method = "circle")
# Initial model
# Fit a multiple linear regression of Price on kilometres driven, seating
# capacity and fuel tank capacity, using the full data2 data frame.
model2 <- lm(
  formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity,
  data = data2
)
# Print the coefficient table and fit statistics of the initial model.
# The `##` lines below are the console output captured from a previous run.
summary(object = model2)
##
## Call:
## lm(formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5582265 -732770 -162588 383156 28135994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.925e+04 2.913e+05 -0.135 0.893
## Kilometer -6.524e+00 7.336e-01 -8.894 <2e-16 ***
## Seating.Capacity -6.467e+05 5.621e+04 -11.505 <2e-16 ***
## Fuel.Tank.Capacity 1.059e+05 2.976e+03 35.575 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1855000 on 1870 degrees of freedom
## Multiple R-squared: 0.4166, Adjusted R-squared: 0.4157
## F-statistic: 445.1 on 3 and 1870 DF, p-value: < 2.2e-16
# Final model
# Backward stepwise selection starting from model2. step() compares candidate
# models by AIC (not by coefficient p-values). No scope is given, so the
# search only drops terms; direction and trace are spelled out with the values
# step() uses by default in that situation.
model_final2 <- step(object = model2, direction = "backward", trace = 1)
# In the recorded trace below, dropping any predictor increases the AIC, so
# the selected model keeps all three predictors.
## Start: AIC=54099.45
## Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity
##
## Df Sum of Sq RSS AIC
## <none> 6.4316e+15 54099
## - Kilometer 1 2.7204e+14 6.7036e+15 54175
## - Seating.Capacity 1 4.5528e+14 6.8869e+15 54226
## - Fuel.Tank.Capacity 1 4.3528e+15 1.0784e+16 55066
# Print the coefficient table and fit statistics of the final model.
summary(object = model_final2)
##
## Call:
## lm(formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5582265 -732770 -162588 383156 28135994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.925e+04 2.913e+05 -0.135 0.893
## Kilometer -6.524e+00 7.336e-01 -8.894 <2e-16 ***
## Seating.Capacity -6.467e+05 5.621e+04 -11.505 <2e-16 ***
## Fuel.Tank.Capacity 1.059e+05 2.976e+03 35.575 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1855000 on 1870 degrees of freedom
## Multiple R-squared: 0.4166, Adjusted R-squared: 0.4157
## F-statistic: 445.1 on 3 and 1870 DF, p-value: < 2.2e-16