library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(dslabs)
data("murders")
# Scatter plot
murders %>% ggplot(aes(x = log10(population/10^6), y = log10(total), color = region, shape = region)) +
  geom_point(show.legend = FALSE) + facet_wrap(~region, nrow = 2)
# Pearson correlation coefficient
murders %>% group_by(region) %>%
  summarise(cor(log10(population/10^6), log10(total), method = "pearson"))
## # A tibble: 4 × 2
## region `cor(log10(population/10^6), log10(total), method = "pearson")`
## <fct> <dbl>
## 1 Northeast 0.970
## 2 South 0.852
## 3 North Central 0.949
## 4 West 0.915
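Before fitting the regression on the Northeast subset, a minimal sketch (not part of the original script) that attaches a p-value and confidence interval to the Pearson correlation for that region with cor.test():
# Sketch: significance test for the Pearson correlation in the Northeast region
murders %>% filter(region == "Northeast") %>%
  with(cor.test(log10(population/10^6), log10(total), method = "pearson"))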
northeast <- murders %>% filter(region == "Northeast") %>%
  mutate(log10pop = log10(population/10^6), log10tot = log10(total)) %>%
  select(population, log10pop, total, log10tot)
northeast
## population log10pop total log10tot
## 1 3574097 0.55316633 97 1.986772
## 2 1328361 0.12331612 11 1.041393
## 3 6547629 0.81608406 118 2.071882
## 4 1316470 0.11941097 5 0.698970
## 5 8791894 0.94408244 246 2.390935
## 6 19378102 1.28731124 517 2.713491
## 7 12702379 1.10388507 457 2.659916
## 8 1052567 0.02224975 16 1.204120
## 9 625741 -0.20360539 2 0.301030
#install.packages("writexl")
library(writexl)
write_xlsx(northeast,"Datos_Reg_Lin.xlsx")
# Linear regression model
modelo <- northeast %>% select(log10pop, log10tot) %>%
  lm(log10tot ~ log10pop, data = .)
model_summary <- summary(modelo)
model_summary
##
## Call:
## lm(formula = log10tot ~ log10pop, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.31628 -0.17842 0.01987 0.06275 0.34500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.8234 0.1119 7.356 0.000155 ***
## log10pop 1.6069 0.1531 10.496 1.55e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2315 on 7 degrees of freedom
## Multiple R-squared: 0.9403, Adjusted R-squared: 0.9317
## F-statistic: 110.2 on 1 and 7 DF, p-value: 1.554e-05
intercept <- coef(modelo)[["(Intercept)"]]
slope <- coef(modelo)[["log10pop"]]
The median of the residuals is close to 0, meaning that 50% of the residuals fall below that value. Both coefficients are significant (***), since their p-values (0.000155 for the intercept, 1.55e-05 for the slope) are below 0.05. The Multiple R-squared of 0.9403 is close to 1, so the model fits the data well.
# How do we interpret the coefficients?
# Is the model significant?
# How good is the model fit?
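A minimal sketch (not part of the original script) that pulls these answers directly from the objects created above instead of reading them off the printed summary:
# Sketch: extract the quantities discussed above programmatically
model_summary$coefficients   # estimates, standard errors, t values and p-values
model_summary$r.squared      # multiple R-squared (about 0.94)
confint(modelo)              # 95% confidence intervals for intercept and slope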
# Plot of the fitted model
northeast %>% ggplot(aes(x = log10pop, y = log10tot)) +
  geom_point() + ggtitle("Linear regression model") +
  geom_smooth(method = "lm", color = "blue") +
  annotate("text", x = 0.1, y = 3, color = "blue",
           label = paste("y = ", round(slope, 2), "x + ", round(intercept, 2)))
## `geom_smooth()` using formula = 'y ~ x'
# Residual analysis
ei <- residuals(modelo); ei
## 1 2 3 4 5 6
## 0.27453592 0.01986875 -0.06282747 -0.31627887 0.05054933 -0.17841806
## 7 8 9
## 0.06274895 0.34499620 -0.19517475
plot(ei)
pred <- fitted(modelo); pred
## 1 2 3 4 5 6 7 8
## 1.7122358 1.0215239 2.1347095 1.0152489 2.3403858 2.8919086 2.5971672 0.8591238
## 9
## 0.4962047
sort(pred)
## 9 8 4 2 1 3 5 7
## 0.4962047 0.8591238 1.0152489 1.0215239 1.7122358 2.1347095 2.3403858 2.5971672
## 6
## 2.8919086
# Detecting influential points
plot(cooks.distance(modelo))
There are no atypical (influential) points, because no observation has a Cook's distance far above the rest.
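To make that visual judgment reproducible, a minimal sketch using the common (but not universal) rule of thumb that flags observations with a Cook's distance above 4/n:
# Sketch: flag observations above the 4/n Cook's distance cutoff (rule of thumb)
cooksd <- cooks.distance(modelo)
which(cooksd > 4/length(cooksd))   # any flagged observation deserves a closer look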
# Model validation
# Do the errors have mean zero?
# H0: the errors have mean zero
# Ha: the errors do NOT have mean zero
t.test(ei)
##
## One Sample t-test
##
## data: ei
## t = -1.3876e-16, df = 8, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.1664627 0.1664627
## sample estimates:
## mean of x
## -1.001682e-17
# If p-value > 0.05, we do not reject the null hypothesis
# n < 30: Shapiro-Wilk test
# n > 30: Anderson-Darling test (see the sketch below)
# Do the errors follow a normal distribution?
# H0: the errors are normally distributed
# Ha: the errors are NOT normally distributed
shapiro.test(ei)
##
## Shapiro-Wilk normality test
##
## data: ei
## W = 0.96004, p-value = 0.7988
# If p-value > 0.05, we do not reject the null hypothesis
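For larger samples (n > 30) the note above points to the Anderson-Darling test; a minimal sketch with the nortest package, shown only for reference since n = 9 here and the Shapiro-Wilk result applies:
# Sketch: Anderson-Darling normality test for larger samples (same hypotheses)
# install.packages("nortest")
library(nortest)
ad.test(ei)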
#install.packages("lmtest")
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Is the error variance constant?
# H0: the error variance is constant
# Ha: the error variance is NOT constant
# Breusch-Pagan test, n = 9
bptest(modelo)
##
## studentized Breusch-Pagan test
##
## data: modelo
## BP = 2.3144, df = 1, p-value = 0.1282
# If p-value > 0.05, we do not reject the null hypothesis
# Are the errors independent?
# H0: the errors are independent
# Ha: the errors are NOT independent
# If p-value > 0.05, we do not reject the null hypothesis
# Durbin-Watson test
dwtest(modelo,alternative="two.sided")
##
## Durbin-Watson test
##
## data: modelo
## DW = 2.0058, p-value = 0.864
## alternative hypothesis: true autocorrelation is not 0
All four checks have p-values above 0.05, so THE MODEL IS VALID.
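As a compact recap (not part of the original script), a minimal sketch that collects the four p-values behind that conclusion in one table:
# Sketch: gather the four assumption checks in a single table
data.frame(
  check   = c("zero mean", "normality", "constant variance", "independence"),
  p_value = c(t.test(ei)$p.value,
              shapiro.test(ei)$p.value,
              bptest(modelo)$p.value,
              dwtest(modelo, alternative = "two.sided")$p.value)
)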
# Predictions
# population = 20000000
round(log10(20000000/10^6),2)
## [1] 1.3
pred1 <- predict(object = modelo, newdata = data.frame(log10pop = log10(20000000/10^6))); pred1
## 1
## 2.913953
total <- 10^(pred1); total
## 1
## 820.2624
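A point prediction hides its uncertainty; as a minimal sketch (not part of the original script), predict() can also return a 95% prediction interval on the log10 scale, which 10^() maps back to the original scale of total murders:
# Sketch: 95% prediction interval on the log10 scale, then back-transformed
pi_log <- predict(modelo, newdata = data.frame(log10pop = log10(20000000/10^6)),
                  interval = "prediction", level = 0.95)
10^pi_log   # fit, lower and upper bounds in total murders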