PAQUETES NECESARIOS PARA REGRESIĂN MULTIPLE
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
library(ggpubr)
library(readr)
Datos
heart.data <- read_csv("heart.data.csv")
## New names:
## Rows: 498 Columns: 4
## ââ Column specification
## ââââââââââââââââââââââââââââââââââââââââââââââââââââââââ Delimiter: "," dbl
## (4): ...1, biking, smoking, heart.disease
## âč Use `spec()` to retrieve the full column specification for this data. âč
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## âą `` -> `...1`
summary(heart.data)
## ...1 biking smoking heart.disease
## Min. : 1.0 Min. : 1.119 Min. : 0.5259 Min. : 0.5519
## 1st Qu.:125.2 1st Qu.:20.205 1st Qu.: 8.2798 1st Qu.: 6.5137
## Median :249.5 Median :35.824 Median :15.8146 Median :10.3853
## Mean :249.5 Mean :37.788 Mean :15.4350 Mean :10.1745
## 3rd Qu.:373.8 3rd Qu.:57.853 3rd Qu.:22.5689 3rd Qu.:13.7240
## Max. :498.0 Max. :74.907 Max. :29.9467 Max. :20.4535
LOS DATOS CONSTA DE 498 OBSERVACIONES, LAS CUALES TIENE COMO VARIABLES RESPUESTA HEART.DISEASE, Y COMO VARIABLES REGRESORAS BIKING, SMOKING
Independencia
cor(heart.data)
## ...1 biking smoking heart.disease
## ...1 1.00000000 0.05708763 0.05267393 -0.05172509
## biking 0.05708763 1.00000000 0.01513618 -0.93545547
## smoking 0.05267393 0.01513618 1.00000000 0.30913098
## heart.disease -0.05172509 -0.93545547 0.30913098 1.00000000
cor(heart.data$biking, heart.data$smoking)
## [1] 0.01513618
LA CORRELACIĂN DE LA VARIABLE HEART CON RESPECTO A LAS VARIABLES BIKING Y SMOKING ES NEGATIVA, FUERTE CON UN R DE -0.9354 Y ES POSITIVA, DEBIL CON UN R DE 0.30 RESPECTIVAMENTE
Normalidad
hist(heart.data$heart.disease)
Linealidad
pairs(heart.data)
plot(heart.disease ~ biking, data=heart.data)
plot(heart.disease ~ smoking, data=heart.data)
DE ACUERDO A LA GRAFICA PODEMOS OBSERVAR QUE LOS DATOS PRESENTAN UNA
RELACIĂN LINEAL NEGATIVA ENTRE ELLOS, POR TANTO SE CUMPLE EL SUPUESTO DE
NORMALIDAS ENTRE LAS VARIABLES
Modelo $$ Y= _o +_1 X_1+ _2 X_2
$$
heart.disease.lm<-lm(heart.disease ~ biking + smoking, data = heart.data) # modelo de regresiĂłn
summary(heart.disease.lm)
##
## Call:
## lm(formula = heart.disease ~ biking + smoking, data = heart.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.1789 -0.4463 0.0362 0.4422 1.9331
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.984658 0.080137 186.99 <2e-16 ***
## biking -0.200133 0.001366 -146.53 <2e-16 ***
## smoking 0.178334 0.003539 50.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.654 on 495 degrees of freedom
## Multiple R-squared: 0.9796, Adjusted R-squared: 0.9795
## F-statistic: 1.19e+04 on 2 and 495 DF, p-value: < 2.2e-16
\[ Y=14.984658-0.200133Biking+0.178334Smoking+e_i \] Homocedasticidad
par(mfrow=c(2,2))
plot(heart.disease.lm)
par(mfrow=c(1,1))
grafico del modelo
plotting.data<-expand.grid(
biking = seq(min(heart.data$biking), max(heart.data$biking), length.out=30),
smoking=c(min(heart.data$smoking), mean(heart.data$smoking), max(heart.data$smoking)))
#valores predictores
plotting.data$predicted.y <- predict.lm(heart.disease.lm, newdata=plotting.data)
plotting.data$smoking <- round(plotting.data$smoking, digits = 2)
plotting.data$smoking <- as.factor(plotting.data$smoking)
heart.plot <- ggplot(heart.data, aes(x=biking, y=heart.disease)) +
geom_point()
heart.plot
Linea de tendencia
heart.plot <- heart.plot +
geom_line(data=plotting.data, aes(x=biking, y=predicted.y, color=smoking), size=1.25)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## âč Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
heart.plot
heart.plot <-
heart.plot +
theme_bw() +
labs(title = "Rates of heart disease (% of population) \n as a function of biking to work and smoking",
x = "Biking to work (% of population)",
y = "Heart disease (% of population)",
color = "Smoking \n (% of population)")
heart.plot
heart.plot + annotate(geom="text", x=30, y=1.75, label=" Y= 15 + (-0.2*biking) + (0.178*smoking)")
anova(heart.disease.lm)
## Analysis of Variance Table
##
## Response: heart.disease
## Df Sum Sq Mean Sq F value Pr(>F)
## biking 1 9090.6 9090.6 21251.7 < 2.2e-16 ***
## smoking 1 1086.0 1086.0 2538.8 < 2.2e-16 ***
## Residuals 495 211.7 0.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#normalidad
library(nortest)
residuos=resid(heart.disease.lm)
n=lillie.test(residuos);n
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: residuos
## D = 0.031511, p-value = 0.2681
\(H_o:\) los residuos no son normales prueba de homecedasticidad
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(heart.disease.lm)
##
## studentized Breusch-Pagan test
##
## data: heart.disease.lm
## BP = 5.7775, df = 2, p-value = 0.05564
#Ingresamos valores RegresiĂłn ineal simpÂŽle sin base de datos
agregado <-c(rep(1,6),rep(2,6),rep(3,6),rep(4,6),rep(5,6))
humedad<- c(551,457,450,731,499,632,595,580,508,583,633,517,639,615,511,573,648,677,417,449,517,438,415,555,563,631,522,613,656,679)
#veirificamos tamaño de los vectores
length(humedad)
## [1] 30
length(agregado)
## [1] 30
#Generamos matriz de datos
datos <- data.frame(agregado = agregado, humedad = humedad)
datos
## agregado humedad
## 1 1 551
## 2 1 457
## 3 1 450
## 4 1 731
## 5 1 499
## 6 1 632
## 7 2 595
## 8 2 580
## 9 2 508
## 10 2 583
## 11 2 633
## 12 2 517
## 13 3 639
## 14 3 615
## 15 3 511
## 16 3 573
## 17 3 648
## 18 3 677
## 19 4 417
## 20 4 449
## 21 4 517
## 22 4 438
## 23 4 415
## 24 4 555
## 25 5 563
## 26 5 631
## 27 5 522
## 28 5 613
## 29 5 656
## 30 5 679
table(datos$agregado)
##
## 1 2 3 4 5
## 6 6 6 6 6
summary(datos)
## agregado humedad
## Min. :1 Min. :415.0
## 1st Qu.:2 1st Qu.:508.8
## Median :3 Median :568.0
## Mean :3 Mean :561.8
## 3rd Qu.:4 3rd Qu.:631.8
## Max. :5 Max. :731.0
#Medias por columnas
aggregate(humedad ~ agregado, data = datos, FUN = mean)
## agregado humedad
## 1 1 553.3333
## 2 2 569.3333
## 3 3 610.5000
## 4 4 465.1667
## 5 5 610.6667
#Varianza por columnas
aggregate(humedad ~ agregado, data = datos, FUN = sd)
## agregado humedad
## 1 1 110.15383
## 2 2 47.98611
## 3 3 59.94581
## 4 4 57.60700
## 5 5 58.78322
## Diagramas de box-plot
require(ggplot2)
ggplot(data = datos, aes(x = factor(agregado), y = humedad, color = factor(agregado)) )+
geom_boxplot() +
theme_bw()
NORMALIDAD ANOVA
#Forma grĂĄfica
par(mfrow = c(3,2))
qqnorm(datos[datos$agregado == 1,"humedad"], main = 1)
qqline(datos[datos$agregado == 1,"humedad"])
qqnorm(datos[datos$agregado == 2,"humedad"], main = 2)
qqline(datos[datos$agregado == 2,"humedad"])
qqnorm(datos[datos$agregado == 3,"humedad"], main = 3)
qqline(datos[datos$agregado == 3,"humedad"])
qqnorm(datos[datos$agregado == 4,"humedad"], main = 4)
qqline(datos[datos$agregado == 4,"humedad"])
qqnorm(datos[datos$agregado == 5,"humedad"], main = 5)
qqline(datos[datos$agregado == 5,"humedad"])
#install.packages("nortest")
require(nortest)
by(data = datos,INDICES = factor(datos$agregado),FUN = function(x){ lillie.test(x$humedad)})
## factor(datos$agregado): 1
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$humedad
## D = 0.18908, p-value = 0.7193
##
## ------------------------------------------------------------
## factor(datos$agregado): 2
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$humedad
## D = 0.25462, p-value = 0.2576
##
## ------------------------------------------------------------
## factor(datos$agregado): 3
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$humedad
## D = 0.19659, p-value = 0.6621
##
## ------------------------------------------------------------
## factor(datos$agregado): 4
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$humedad
## D = 0.27717, p-value = 0.1578
##
## ------------------------------------------------------------
## factor(datos$agregado): 5
##
## Lilliefors (Kolmogorov-Smirnov) normality test
##
## data: x$humedad
## D = 0.1825, p-value = 0.7676
"Los test de hipĂłtesis no muestran evidencias de falta de normalidad."
## [1] "Los test de hipĂłtesis no muestran evidencias de falta de normalidad."
Homocedasticidad
bartlett.test(humedad ~ agregado,datos)
##
## Bartlett test of homogeneity of variances
##
## data: humedad by agregado
## Bartlett's K-squared = 4.4406, df = 4, p-value = 0.3497
fligner.test(humedad ~ agregado,datos)
##
## Fligner-Killeen test of homogeneity of variances
##
## data: humedad by agregado
## Fligner-Killeen:med chi-squared = 3.974, df = 4, p-value = 0.4095
ANOVA
anova <- aov(datos$humedad ~ factor(datos$agregado))
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## factor(datos$agregado) 4 85356 21339 4.302 0.00875 **
## Residuals 25 124020 4961
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
plot(anova)
#comparaciones mĂșltiples(no es necesario para el trabajo)
#pairwise.t.test(x = datos$humedad, g = factor(datos$agregado), p.adjust.method = "holm",
# pool.sd = TRUE, paired = FALSE, alternative = "two.sided")
#TukeyHSD(anova)
#plot(TukeyHSD(anova))