20240815-Clase

Cargar librerías

##install.packages("writexl")
##install.packages("corrplot")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(dslabs)
library(writexl)
library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(corrplot)

## corrplot 0.92 loaded

data("murders")

Gráfico de dispersión

murders %>% ggplot(aes(x=log10(population/10^6), y=log10(total),
    color=region,shape=region))+
  geom_point(show.legend=FALSE)+
  facet_wrap(~region,nrow=2)

Coeficiente de correlación de Pearson

murders %>% group_by(region) %>%
  summarise(cor(log10(population/10^6),log10(total),
  method = "pearson"))

## # A tibble: 4 × 2
##   region        `cor(log10(population/10^6), log10(total), method = "pearson")`
##   <fct>                                                                   <dbl>
## 1 Northeast                                                               0.970
## 2 South                                                                   0.852
## 3 North Central                                                           0.949
## 4 West                                                                    0.915

Crear el data frame filtrando por región

Crear nueva columna log10pop

northeast <- murders %>% filter(region=="Northeast") %>%
    mutate(log10pop=log10(population/10^6),log10tot=log10(total)) %>%
    select(population,log10pop,total,log10tot)

## Exportar archivo en formato xlsx
## Previamente instalar paquete y cargar librería writexl
## write_xlsx(northeast,"Datos_Re_Lin.xlsx")

Modelo de regression lineal

modelo <- northeast %>% select(log10pop,log10tot) %>%
  lm(log10tot~log10pop,data=.)
  summary <- summary(modelo)

intercept <- modelo[["coefficients"]][["(Intercept)"]]
slope <- modelo[["coefficients"]][["log10pop"]]

Gráfico del modelo

northeast %>% ggplot(aes(x=log10pop,y=log10tot)) +
  geom_point() + ggtitle("Regresion linear model") +
  geom_smooth(method = "lm",color="blue")+
  geom_text(aes(label = 
        paste("y = ",round(slope,2),"x+",round(intercept,2)),x=0.1,y=3),color="blue")

## Warning in geom_text(aes(label = paste("y = ", round(slope, 2), "x+", round(intercept, : All aesthetics have length 1, but the data has 9 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## `geom_smooth()` using formula = 'y ~ x'

Análisis de los residuales

ei <- residuals(modelo); ei

##           1           2           3           4           5           6 
##  0.27453592  0.01986875 -0.06282747 -0.31627887  0.05054933 -0.17841806 
##           7           8           9 
##  0.06274895  0.34499620 -0.19517475

plot(ei)

pred <- sort(fitted(modelo));pred

##         9         8         4         2         1         3         5         7 
## 0.4962047 0.8591238 1.0152489 1.0215239 1.7122358 2.1347095 2.3403858 2.5971672 
##         6 
## 2.8919086

Detección de puntos influyentes

cooks.distance(modelo)

##           1           2           3           4           5           6 
## 0.099145728 0.001011869 0.007440599 0.259238013 0.006705056 0.264446797 
##           7           8           9 
## 0.016916749 0.412017763 0.287763997

Validación del modelo

Ho = Los errores tienen media cero

Ha = Los errores NO tienen media cero

Si p-value >0.05 acepto la hipótesis nula

Ho = Los errores tienen media cero Ha = Los errores NO tienen media cero Si p-value >0.05 acepto la hipótesis nula

t.test(ei)

## 
##  One Sample t-test
## 
## data:  ei
## t = -1.3876e-16, df = 8, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1664627  0.1664627
## sample estimates:
##     mean of x 
## -1.001682e-17

Prueba Shapiro Wilk para n<30

Ho = Los errores tienen una distribución normal Ha = Los errores NO tienen una distribución normal Si p-value >0.05 acepto la hipótesis nula

shapiro.test(ei)

## 
##  Shapiro-Wilk normality test
## 
## data:  ei
## W = 0.96004, p-value = 0.7988

## p-value=1 acepto la Ho

Prueba de varianza

Ho = Los varianza de los errores es constante Ha = Los varianza de los errores NO es constante Si p-value >0.05 acepto la hipótesis nula

bptest(modelo)

## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 2.3144, df = 1, p-value = 0.1282

##p-value=0.1282 acepto la Ho

Prueba de independencia

Ho = Los errores son independientes Ha = Los errores NO son independientes Si p-value >0.05 acepto la hipótesis nula

dwtest(modelo,alternative="two.sided")

## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 2.0058, p-value = 0.864
## alternative hypothesis: true autocorrelation is not 0

##p-value=0.864 acepto la Ho

Predicciones

pred1 <- predict(object = modelo, newdata = data.frame(
      log10pop=log10(20000000/10^6)));pred1

##        1 
## 2.913953

total=10^(pred1);total

##        1 
## 820.2624

20240815-Clase_10

Andrea Zabala Quimbayo

2024-08-16