#install.packages("readxl")
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
##
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dslabs)
library(writexl)
library(lmtest)
## Cargando paquete requerido: zoo
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(readr)
library(readxl)
Descargamos la base de datos para analisis.
ageandheight <- read_excel("ageandheight.xls",sheet = "Hoja2")
ageandheight
## # A tibble: 12 × 3
## age height no_siblings
## <dbl> <dbl> <dbl>
## 1 18 76.1 0
## 2 19 77 4
## 3 20 78.1 0
## 4 21 78.2 4
## 5 22 78.8 2
## 6 23 79.7 4
## 7 24 7.99 3
## 8 25 81.1 2
## 9 26 81.2 1
## 10 27 81.8 5
## 11 28 82.8 1
## 12 29 83.5 0
Construimos el grafico de dispersion para analizar los datos
#Gráfico de Dispersión
grPointageandheight <- ggplot(ageandheight, aes(x = age, y = height)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "blue") +
labs(title = "Age Vs Height",
x = "Age",
y = "Height") +
ylim(75,max(ageandheight$height)+2) +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
grPointageandheightinter<-ggplotly(grPointageandheight)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
grPointageandheightinter
Se calcula el coeficiente de correlacion para los datos, usando la variable cualitativa que los clasifica, en este caso no_siblings. Al verificar los datos, se observa que el agrupador que tiene datos consistentes y una correlacion adecuada.
#Coeficiente de correlación de Pearson
ageandheight %>%
group_by(no_siblings) %>%
summarise(cor(age,height,method = "pearson"))
## # A tibble: 6 × 2
## no_siblings `cor(age, height, method = "pearson")`
## <dbl> <dbl>
## 1 0 0.996
## 2 1 1
## 3 2 1
## 4 3 NA
## 5 4 0.998
## 6 5 NA
Escogemos el grupo de datos con el no_siblings igual a 4, por que el coeficiente de correlacion es cercano a 1 para la cantidad de datos que se descargaron de la base datos.
Nota: el clasificador no_siblings es un valor variable, pues en el excel esta formulado y puede suceder que la correlacion cambie segun el valor aleatorio que tome cada registro.
nosiblings1 <- ageandheight %>%
filter(no_siblings == 4) %>%
mutate(log10Age = log10(age), log10Height = log10(height))
nosiblings1
## # A tibble: 3 × 5
## age height no_siblings log10Age log10Height
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 19 77 4 1.28 1.89
## 2 21 78.2 4 1.32 1.89
## 3 23 79.7 4 1.36 1.90
Ahora, se calcula el modelo de regresion lineal.
#Modelo de regresión lineal
modelo <- lm(log10Height~log10Age,data=nosiblings1)
modelo
##
## Call:
## lm(formula = log10Height ~ log10Age, data = nosiblings1)
##
## Coefficients:
## (Intercept) log10Age
## 1.656 0.180
summary<-summary(modelo)
summary
##
## Call:
## lm(formula = log10Height ~ log10Age, data = nosiblings1)
##
## Residuals:
## 1 2 3
## 0.0003567 -0.0007492 0.0003925
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.65601 0.02067 80.14 0.00794 **
## log10Age 0.17996 0.01564 11.51 0.05519 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.000918 on 1 degrees of freedom
## Multiple R-squared: 0.9925, Adjusted R-squared: 0.985
## F-statistic: 132.4 on 1 and 1 DF, p-value: 0.05519
Se calcula el intercepto y el slope.
intercept <- modelo[["coefficients"]][["(Intercept)"]]
slope <- modelo[["coefficients"]][["log10Age"]]
Graficamos el modelo de regresion lineal
#Gráfico del modelo
grnosiblings1 <- ggplot(nosiblings1,aes(x=nosiblings1$log10Age,y=nosiblings1$log10Height)) +
geom_point() + ggtitle("Regresion linear model") +
geom_smooth(method = "lm",color="blue")+
ylim(1.875,1.925) +
geom_text(aes(label = paste("y = ",round(slope,2),"x + ",round(intercept,2)),x=1.3,y=1.92),color="blue")
grnosiblings1
## Warning: Use of `nosiblings1$log10Age` is discouraged.
## ℹ Use `log10Age` instead.
## Warning: Use of `nosiblings1$log10Height` is discouraged.
## ℹ Use `log10Height` instead.
## Warning: Use of `nosiblings1$log10Age` is discouraged.
## ℹ Use `log10Age` instead.
## Warning: Use of `nosiblings1$log10Height` is discouraged.
## ℹ Use `log10Height` instead.
## Warning in geom_text(aes(label = paste("y = ", round(slope, 2), "x + ", : All aesthetics have length 1, but the data has 3 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## `geom_smooth()` using formula = 'y ~ x'
Se realizan las pruebas del modelo para su aceptacion.
ei <- residuals(modelo);ei
## 1 2 3
## 0.0003567445 -0.0007492204 0.0003924758
plot(ei)
pred<-sort(fitted(modelo));pred
## 1 2 3
## 1.886134 1.893956 1.901066
#Detección de puntos influyentes
plot(cooks.distance(modelo))
#Validación del modelo
#si pvalor>0.05 acepto la hipótesis nula
t.test(ei)
##
## One Sample t-test
##
## data: ei
## t = 9.6437e-17, df = 2, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.001612428 0.001612428
## sample estimates:
## mean of x
## 3.614007e-20
shapiro.test(ei)
##
## Shapiro-Wilk normality test
##
## data: ei
## W = 0.77345, p-value = 0.05257
#Prueba de Breusch Pagan n=3
bptest(modelo)
##
## studentized Breusch-Pagan test
##
## data: modelo
## BP = 0.020413, df = 1, p-value = 0.8864
#Prueba de Durbin Watson
dwtest(modelo,alternative = "two.sided")
##
## Durbin-Watson test
##
## data: modelo
## DW = 2.9985, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is not 0
Se realiza una prueba de comprobacion del error, tomando un bebe de 36 meses para calcular su posible estatura en cm.
#Predicciones
# edad = 36
round(log10(36),2)
## [1] 1.56
pred1 <- predict(object = modelo, newdata = data.frame(log10Age = log10(36)));pred1
## 1
## 1.936081
total = 10^(pred1);total
## 1
## 86.31397