REGRESION LINEAL MULTIPLE

PAQUETES NECESARIOS PARA REGRESIÓN MULTIPLE

library(ggplot2)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(broom)
library(ggpubr)
library(readr)

Datos

heart.data <- read_csv("heart.data.csv")

## New names:
## Rows: 498 Columns: 4
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," dbl
## (4): ...1, biking, smoking, heart.disease
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`

summary(heart.data)

##       ...1           biking          smoking        heart.disease    
##  Min.   :  1.0   Min.   : 1.119   Min.   : 0.5259   Min.   : 0.5519  
##  1st Qu.:125.2   1st Qu.:20.205   1st Qu.: 8.2798   1st Qu.: 6.5137  
##  Median :249.5   Median :35.824   Median :15.8146   Median :10.3853  
##  Mean   :249.5   Mean   :37.788   Mean   :15.4350   Mean   :10.1745  
##  3rd Qu.:373.8   3rd Qu.:57.853   3rd Qu.:22.5689   3rd Qu.:13.7240  
##  Max.   :498.0   Max.   :74.907   Max.   :29.9467   Max.   :20.4535

LOS DATOS CONSTA DE 498 OBSERVACIONES, LAS CUALES TIENE COMO VARIABLES RESPUESTA HEART.DISEASE, Y COMO VARIABLES REGRESORAS BIKING, SMOKING

Independencia

cor(heart.data)

##                      ...1      biking    smoking heart.disease
## ...1           1.00000000  0.05708763 0.05267393   -0.05172509
## biking         0.05708763  1.00000000 0.01513618   -0.93545547
## smoking        0.05267393  0.01513618 1.00000000    0.30913098
## heart.disease -0.05172509 -0.93545547 0.30913098    1.00000000

cor(heart.data$biking, heart.data$smoking)

## [1] 0.01513618

LA CORRELACIÓN DE LA VARIABLE HEART CON RESPECTO A LAS VARIABLES BIKING Y SMOKING ES NEGATIVA, FUERTE CON UN R DE -0.9354 Y ES POSITIVA, DEBIL CON UN R DE 0.30 RESPECTIVAMENTE

Normalidad

hist(heart.data$heart.disease)

Linealidad

pairs(heart.data)

plot(heart.disease ~ biking, data=heart.data)

plot(heart.disease ~ smoking, data=heart.data)

DE ACUERDO A LA GRAFICA PODEMOS OBSERVAR QUE LOS DATOS PRESENTAN UNA RELACIÓN LINEAL NEGATIVA ENTRE ELLOS, POR TANTO SE CUMPLE EL SUPUESTO DE NORMALIDAS ENTRE LAS VARIABLES

Modelo $$ Y= _o +_1 X_1+ _2 X_2

heart.disease.lm<-lm(heart.disease ~ biking + smoking, data = heart.data) # modelo de regresión 

summary(heart.disease.lm)

## 
## Call:
## lm(formula = heart.disease ~ biking + smoking, data = heart.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1789 -0.4463  0.0362  0.4422  1.9331 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.984658   0.080137  186.99   <2e-16 ***
## biking      -0.200133   0.001366 -146.53   <2e-16 ***
## smoking      0.178334   0.003539   50.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.654 on 495 degrees of freedom
## Multiple R-squared:  0.9796, Adjusted R-squared:  0.9795 
## F-statistic: 1.19e+04 on 2 and 495 DF,  p-value: < 2.2e-16

\[ Y=14.984658-0.200133Biking+0.178334Smoking+e_i \] Homocedasticidad

par(mfrow=c(2,2))
plot(heart.disease.lm)

par(mfrow=c(1,1))

grafico del modelo

plotting.data<-expand.grid(
  biking = seq(min(heart.data$biking), max(heart.data$biking), length.out=30),
    smoking=c(min(heart.data$smoking), mean(heart.data$smoking), max(heart.data$smoking)))
#valores predictores
plotting.data$predicted.y <- predict.lm(heart.disease.lm, newdata=plotting.data)
plotting.data$smoking <- round(plotting.data$smoking, digits = 2)
plotting.data$smoking <- as.factor(plotting.data$smoking)
heart.plot <- ggplot(heart.data, aes(x=biking, y=heart.disease)) +
  geom_point()

heart.plot

Linea de tendencia

heart.plot <- heart.plot +
  geom_line(data=plotting.data, aes(x=biking, y=predicted.y, color=smoking), size=1.25)

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

heart.plot

heart.plot <-
heart.plot +
  theme_bw() +
  labs(title = "Rates of heart disease (% of population) \n as a function of biking to work and smoking",
      x = "Biking to work (% of population)",
      y = "Heart disease (% of population)",
      color = "Smoking \n (% of population)")

heart.plot

heart.plot + annotate(geom="text", x=30, y=1.75, label=" Y= 15 + (-0.2*biking) + (0.178*smoking)")

anova(heart.disease.lm)

## Analysis of Variance Table
## 
## Response: heart.disease
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## biking      1 9090.6  9090.6 21251.7 < 2.2e-16 ***
## smoking     1 1086.0  1086.0  2538.8 < 2.2e-16 ***
## Residuals 495  211.7     0.4                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#normalidad 
library(nortest)
residuos=resid(heart.disease.lm)
n=lillie.test(residuos);n

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  residuos
## D = 0.031511, p-value = 0.2681

$H_o:$ los residuos no son normales prueba de homecedasticidad

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(heart.disease.lm)

## 
##  studentized Breusch-Pagan test
## 
## data:  heart.disease.lm
## BP = 5.7775, df = 2, p-value = 0.05564

#Ingresamos valores Regresión ineal simp´le sin base de datos
agregado <-c(rep(1,6),rep(2,6),rep(3,6),rep(4,6),rep(5,6))
humedad<- c(551,457,450,731,499,632,595,580,508,583,633,517,639,615,511,573,648,677,417,449,517,438,415,555,563,631,522,613,656,679)
#veirificamos tamaño de los vectores
length(humedad)

## [1] 30

length(agregado)

## [1] 30

#Generamos matriz de datos            
datos <- data.frame(agregado = agregado, humedad = humedad)
datos

##    agregado humedad
## 1         1     551
## 2         1     457
## 3         1     450
## 4         1     731
## 5         1     499
## 6         1     632
## 7         2     595
## 8         2     580
## 9         2     508
## 10        2     583
## 11        2     633
## 12        2     517
## 13        3     639
## 14        3     615
## 15        3     511
## 16        3     573
## 17        3     648
## 18        3     677
## 19        4     417
## 20        4     449
## 21        4     517
## 22        4     438
## 23        4     415
## 24        4     555
## 25        5     563
## 26        5     631
## 27        5     522
## 28        5     613
## 29        5     656
## 30        5     679

table(datos$agregado)

## 
## 1 2 3 4 5 
## 6 6 6 6 6

summary(datos)

##     agregado    humedad     
##  Min.   :1   Min.   :415.0  
##  1st Qu.:2   1st Qu.:508.8  
##  Median :3   Median :568.0  
##  Mean   :3   Mean   :561.8  
##  3rd Qu.:4   3rd Qu.:631.8  
##  Max.   :5   Max.   :731.0

#Medias por columnas

aggregate(humedad ~ agregado, data = datos, FUN = mean)

##   agregado  humedad
## 1        1 553.3333
## 2        2 569.3333
## 3        3 610.5000
## 4        4 465.1667
## 5        5 610.6667

#Varianza por columnas

aggregate(humedad ~ agregado, data = datos, FUN = sd)

##   agregado   humedad
## 1        1 110.15383
## 2        2  47.98611
## 3        3  59.94581
## 4        4  57.60700
## 5        5  58.78322

## Diagramas de box-plot
require(ggplot2)

ggplot(data = datos, aes(x = factor(agregado), y = humedad, color = factor(agregado)) )+
  geom_boxplot() +
  theme_bw()

NORMALIDAD ANOVA

#Forma gráfica
par(mfrow = c(3,2))
qqnorm(datos[datos$agregado == 1,"humedad"], main = 1)
qqline(datos[datos$agregado == 1,"humedad"])
qqnorm(datos[datos$agregado == 2,"humedad"], main = 2)
qqline(datos[datos$agregado == 2,"humedad"])
qqnorm(datos[datos$agregado == 3,"humedad"], main = 3)
qqline(datos[datos$agregado == 3,"humedad"])
qqnorm(datos[datos$agregado == 4,"humedad"], main = 4)
qqline(datos[datos$agregado == 4,"humedad"])
qqnorm(datos[datos$agregado == 5,"humedad"], main = 5)
qqline(datos[datos$agregado == 5,"humedad"])

#install.packages("nortest")
require(nortest)
by(data = datos,INDICES = factor(datos$agregado),FUN = function(x){ lillie.test(x$humedad)})

## factor(datos$agregado): 1
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$humedad
## D = 0.18908, p-value = 0.7193
## 
## ------------------------------------------------------------ 
## factor(datos$agregado): 2
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$humedad
## D = 0.25462, p-value = 0.2576
## 
## ------------------------------------------------------------ 
## factor(datos$agregado): 3
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$humedad
## D = 0.19659, p-value = 0.6621
## 
## ------------------------------------------------------------ 
## factor(datos$agregado): 4
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$humedad
## D = 0.27717, p-value = 0.1578
## 
## ------------------------------------------------------------ 
## factor(datos$agregado): 5
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x$humedad
## D = 0.1825, p-value = 0.7676

"Los test de hipótesis no muestran evidencias de falta de normalidad."

## [1] "Los test de hipótesis no muestran evidencias de falta de normalidad."

Homocedasticidad

bartlett.test(humedad ~ agregado,datos)

## 
##  Bartlett test of homogeneity of variances
## 
## data:  humedad by agregado
## Bartlett's K-squared = 4.4406, df = 4, p-value = 0.3497

fligner.test(humedad ~ agregado,datos)

## 
##  Fligner-Killeen test of homogeneity of variances
## 
## data:  humedad by agregado
## Fligner-Killeen:med chi-squared = 3.974, df = 4, p-value = 0.4095

ANOVA

anova <- aov(datos$humedad ~ factor(datos$agregado))
summary(anova)

##                        Df Sum Sq Mean Sq F value  Pr(>F)   
## factor(datos$agregado)  4  85356   21339   4.302 0.00875 **
## Residuals              25 124020    4961                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

plot(anova)

#comparaciones múltiples(no es necesario para el trabajo)
#pairwise.t.test(x = datos$humedad, g = factor(datos$agregado), p.adjust.method = "holm",
 #               pool.sd = TRUE, paired = FALSE, alternative = "two.sided")

#TukeyHSD(anova)



#plot(TukeyHSD(anova))

REGRESION LINEAL MULTIPLE

ADALIS HERRERA

2025-11-06