R Markdown

## Caso de Estudio 1: Análisis del Dataset MTCars

# Load the MTCars dataset from the GitHub-hosted CSV; the first row of the
# file supplies the column names.
data1 <- read.csv(
  file = "https://raw.githubusercontent.com/geovannychoez/prueba/master/MTCars.csv",
  header = TRUE
)
# Preview the first six rows to confirm the import looks as expected.
head(data1, n = 6L)
##   car_ID symboling                  CarName fueltype aspiration doornumber
## 1      1         3       alfa-romero giulia      gas        std        two
## 2      2         3      alfa-romero stelvio      gas        std        two
## 3      3         1 alfa-romero Quadrifoglio      gas        std        two
## 4      4         2              audi 100 ls      gas        std       four
## 5      5         2               audi 100ls      gas        std       four
## 6      6         2                 audi fox      gas        std        two
##       carbody drivewheel enginelocation wheelbase carlength carwidth carheight
## 1 convertible        rwd          front      88.6     168.8     64.1      48.8
## 2 convertible        rwd          front      88.6     168.8     64.1      48.8
## 3   hatchback        rwd          front      94.5     171.2     65.5      52.4
## 4       sedan        fwd          front      99.8     176.6     66.2      54.3
## 5       sedan        4wd          front      99.4     176.6     66.4      54.3
## 6       sedan        fwd          front      99.8     177.3     66.3      53.1
##   curbweight enginetype cylindernumber enginesize fuelsystem boreratio stroke
## 1       2548       dohc           four        130       mpfi      3.47   2.68
## 2       2548       dohc           four        130       mpfi      3.47   2.68
## 3       2823       ohcv            six        152       mpfi      2.68   3.47
## 4       2337        ohc           four        109       mpfi      3.19   3.40
## 5       2824        ohc           five        136       mpfi      3.19   3.40
## 6       2507        ohc           five        136       mpfi      3.19   3.40
##   compressionratio horsepower peakrpm citympg highwaympg price
## 1              9.0        111    5000      21         27 13495
## 2              9.0        111    5000      21         27 16500
## 3              9.0        154    5000      19         26 16500
## 4             10.0        102    5500      24         30 13950
## 5              8.0        115    5500      18         22 17450
## 6              8.5        110    5500      19         25 15250

Matriz Gráfica de Correlaciones

Matriz Inicial

## Calcular la matriz de correlación
library(corrplot)
## corrplot 0.94 loaded
# Keep only the numeric columns of data1.
# vapply() guarantees a logical mask (sapply() can silently change its return
# type), and drop = FALSE keeps a data frame even if only one column qualifies.
data1_num <- data1[, vapply(data1, is.numeric, logical(1)), drop = FALSE]
# Correlation matrix of the numeric columns. Pairwise-complete observations
# keep a missing value in one column from turning every correlation that
# involves that column into NA; with no missing data the result is unchanged.
cor_matrix1 <- cor(data1_num, use = "pairwise.complete.obs")
# Plot the initial (unfiltered) correlation matrix.
corrplot(cor_matrix1, method = "circle")

Matriz Final (Correlaciones más Importantes)

# Select the variables whose absolute correlation with at least one OTHER
# variable is greater than 0.5.
strong_corr1 <- abs(cor_matrix1) > 0.5
# The diagonal holds each variable's correlation with itself (always 1). If it
# is left in the mask, every column passes the threshold and nothing is
# filtered out, so it is excluded explicitly.
diag(strong_corr1) <- FALSE
# na.rm = TRUE: an undefined (NA) correlation must not count as a strong one.
important_vars <- which(apply(strong_corr1, 2, any, na.rm = TRUE))

# Restrict the numeric data to the selected variables; drop = FALSE keeps a
# data frame even when a single variable is selected.
data_important_corr <- data1_num[, important_vars, drop = FALSE]

# Correlation matrix of the selected variables only (pairwise-complete so that
# missing values do not blank out whole rows/columns).
cor_matrix_important <- cor(data_important_corr, use = "pairwise.complete.obs")

# Plot the filtered correlation matrix.
corrplot(cor_matrix_important, method = "circle")

Modelo de Regresión Lineal Múltiple

#Modelo Inicial

# Fit the initial multiple linear regression: price explained by horsepower,
# engine size and curb weight.
model1 <- lm(
  formula = price ~ horsepower + enginesize + curbweight,
  data = data1
)

# Coefficient table, residual summary and goodness-of-fit statistics of the
# initial model.
summary(model1)
## 
## Call:
## lm(formula = price ~ horsepower + enginesize + curbweight, data = data1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9003  -1701    -24   1340  13760 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.346e+04  1.333e+03 -10.100  < 2e-16 ***
## horsepower   4.875e+01  1.070e+01   4.557 8.99e-06 ***
## enginesize   8.488e+01  1.276e+01   6.651 2.69e-10 ***
## curbweight   4.263e+00  9.065e-01   4.702 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3473 on 201 degrees of freedom
## Multiple R-squared:  0.8138, Adjusted R-squared:  0.811 
## F-statistic: 292.9 on 3 and 201 DF,  p-value: < 2.2e-16

#Modelo Final

# Stepwise model selection starting from model1. step() compares candidate
# models by AIC (not by coefficient p-values); with no scope supplied the
# search is backward elimination, stated explicitly here.
model_final1 <- step(object = model1, direction = "backward")
## Start:  AIC=3346.56
## price ~ horsepower + enginesize + curbweight
## 
##              Df Sum of Sq        RSS    AIC
## <none>                    2423994702 3346.6
## - horsepower  1 250452350 2674447051 3364.7
## - curbweight  1 266633734 2690628436 3366.0
## - enginesize  1 533516727 2957511429 3385.3
# Summary of the final model, i.e. the model returned by step()
summary(model_final1)
## 
## Call:
## lm(formula = price ~ horsepower + enginesize + curbweight, data = data1)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
##  -9003  -1701    -24   1340  13760 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.346e+04  1.333e+03 -10.100  < 2e-16 ***
## horsepower   4.875e+01  1.070e+01   4.557 8.99e-06 ***
## enginesize   8.488e+01  1.276e+01   6.651 2.69e-10 ***
## curbweight   4.263e+00  9.065e-01   4.702 4.78e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3473 on 201 degrees of freedom
## Multiple R-squared:  0.8138, Adjusted R-squared:  0.811 
## F-statistic: 292.9 on 3 and 201 DF,  p-value: < 2.2e-16

## Caso de Estudio 2: Análisis del Dataset UsedCars

# Load the UsedCars dataset from the GitHub-hosted CSV; the first row of the
# file supplies the column names.
data2 <- read.csv(
  file = "https://raw.githubusercontent.com/geovannychoez/prueba/master/UsedCars.csv",
  header = TRUE
)
# Preview the first six rows to confirm the import looks as expected.
head(data2, n = 6L)
##            Make                           Model   Price Year Kilometer
## 1         Honda             Amaze 1.2 VX i-VTEC  505000 2017     87150
## 2 Maruti Suzuki                 Swift DZire VDI  450000 2014     75000
## 3       Hyundai            i10 Magna 1.2 Kappa2  220000 2011     67000
## 4        Toyota                        Glanza G  799000 2019     37500
## 5        Toyota Innova 2.4 VX 7 STR [2016-2020] 1950000 2018     69000
## 6 Maruti Suzuki                        Ciaz ZXi  675000 2017     73315
##   Fuel.Type Transmission  Location  Color  Owner Seller.Type  Engine
## 1    Petrol       Manual      Pune   Grey  First   Corporate 1198 cc
## 2    Diesel       Manual  Ludhiana  White Second  Individual 1248 cc
## 3    Petrol       Manual   Lucknow Maroon  First  Individual 1197 cc
## 4    Petrol       Manual Mangalore    Red  First  Individual 1197 cc
## 5    Diesel       Manual    Mumbai   Grey  First  Individual 2393 cc
## 6    Petrol       Manual      Pune   Grey  First  Individual 1373 cc
##            Max.Power             Max.Torque Drivetrain Length Width Height
## 1  87 bhp @ 6000 rpm      109 Nm @ 4500 rpm        FWD   3990  1680   1505
## 2  74 bhp @ 4000 rpm      190 Nm @ 2000 rpm        FWD   3995  1695   1555
## 3  79 bhp @ 6000 rpm 112.7619 Nm @ 4000 rpm        FWD   3585  1595   1550
## 4  82 bhp @ 6000 rpm      113 Nm @ 4200 rpm        FWD   3995  1745   1510
## 5 148 bhp @ 3400 rpm      343 Nm @ 1400 rpm        RWD   4735  1830   1795
## 6  91 bhp @ 6000 rpm      130 Nm @ 4000 rpm        FWD   4490  1730   1485
##   Seating.Capacity Fuel.Tank.Capacity
## 1                5                 35
## 2                5                 42
## 3                5                 35
## 4                5                 37
## 5                7                 55
## 6                5                 43

Matriz Gráfica de Correlaciones

Matriz Inicial

## Calcular la matriz de correlación
library(corrplot)
# Keep only the numeric columns of data2.
# vapply() guarantees a logical mask (sapply() can silently change its return
# type), and drop = FALSE keeps a data frame even if only one column qualifies.
data2_num <- data2[, vapply(data2, is.numeric, logical(1)), drop = FALSE]
# Correlation matrix of the numeric columns. Pairwise-complete observations
# keep a missing value in one column from turning every correlation that
# involves that column into NA; with no missing data the result is unchanged.
cor_matrix2 <- cor(data2_num, use = "pairwise.complete.obs")
# Plot the initial (unfiltered) correlation matrix.
corrplot(cor_matrix2, method = "circle")

Matriz Final (Correlaciones más Importantes)

# Select the variables whose absolute correlation with at least one OTHER
# variable is greater than 0.5.
strong_corr2 <- abs(cor_matrix2) > 0.5
# The diagonal holds each variable's correlation with itself (always 1). If it
# is left in the mask, every column passes the threshold and nothing is
# filtered out, so it is excluded explicitly.
diag(strong_corr2) <- FALSE
# na.rm = TRUE: an undefined (NA) correlation must not count as a strong one.
important_vars2 <- which(apply(strong_corr2, 2, any, na.rm = TRUE))

# Restrict the numeric data to the selected variables; drop = FALSE keeps a
# data frame even when a single variable is selected.
data_important_corr2 <- data2_num[, important_vars2, drop = FALSE]

# Correlation matrix of the selected variables only (pairwise-complete so that
# missing values do not blank out whole rows/columns).
cor_matrix_important2 <- cor(data_important_corr2, use = "pairwise.complete.obs")

# Plot the filtered correlation matrix.
corrplot(cor_matrix_important2, method = "circle")

Modelo de Regresión Lineal Múltiple

#Modelo Inicial

# Fit the initial multiple linear regression: Price explained by kilometers
# driven, seating capacity and fuel tank capacity.
model2 <- lm(
  formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity,
  data = data2
)

# Coefficient table, residual summary and goodness-of-fit statistics of the
# initial model.
summary(model2)
## 
## Call:
## lm(formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity, 
##     data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5582265  -732770  -162588   383156 28135994 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.925e+04  2.913e+05  -0.135    0.893    
## Kilometer          -6.524e+00  7.336e-01  -8.894   <2e-16 ***
## Seating.Capacity   -6.467e+05  5.621e+04 -11.505   <2e-16 ***
## Fuel.Tank.Capacity  1.059e+05  2.976e+03  35.575   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1855000 on 1870 degrees of freedom
## Multiple R-squared:  0.4166, Adjusted R-squared:  0.4157 
## F-statistic: 445.1 on 3 and 1870 DF,  p-value: < 2.2e-16

#Modelo Final

# Stepwise model selection starting from model2. step() compares candidate
# models by AIC (not by coefficient p-values); with no scope supplied the
# search is backward elimination, stated explicitly here.
model_final2 <- step(object = model2, direction = "backward")
## Start:  AIC=54099.45
## Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity
## 
##                      Df  Sum of Sq        RSS   AIC
## <none>                             6.4316e+15 54099
## - Kilometer           1 2.7204e+14 6.7036e+15 54175
## - Seating.Capacity    1 4.5528e+14 6.8869e+15 54226
## - Fuel.Tank.Capacity  1 4.3528e+15 1.0784e+16 55066
# Summary of the final model, i.e. the model returned by step()
summary(model_final2)
## 
## Call:
## lm(formula = Price ~ Kilometer + Seating.Capacity + Fuel.Tank.Capacity, 
##     data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5582265  -732770  -162588   383156 28135994 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.925e+04  2.913e+05  -0.135    0.893    
## Kilometer          -6.524e+00  7.336e-01  -8.894   <2e-16 ***
## Seating.Capacity   -6.467e+05  5.621e+04 -11.505   <2e-16 ***
## Fuel.Tank.Capacity  1.059e+05  2.976e+03  35.575   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1855000 on 1870 degrees of freedom
## Multiple R-squared:  0.4166, Adjusted R-squared:  0.4157 
## F-statistic: 445.1 on 3 and 1870 DF,  p-value: < 2.2e-16