#install.packages("readxl")
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(plotly)
## 
## Adjuntando el paquete: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(dslabs)
library(writexl)
library(lmtest)
## Cargando paquete requerido: zoo
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(readr)
library(readxl)

Descargamos la base de datos para analisis.

ageandheight <- read_excel("ageandheight.xls",sheet = "Hoja2")
ageandheight
## # A tibble: 12 × 3
##      age height no_siblings
##    <dbl>  <dbl>       <dbl>
##  1    18  76.1            0
##  2    19  77              4
##  3    20  78.1            0
##  4    21  78.2            4
##  5    22  78.8            2
##  6    23  79.7            4
##  7    24   7.99           3
##  8    25  81.1            2
##  9    26  81.2            1
## 10    27  81.8            5
## 11    28  82.8            1
## 12    29  83.5            0

Construimos el grafico de dispersion para analizar los datos

#Gráfico de Dispersión
grPointageandheight <- ggplot(ageandheight, aes(x = age, y = height)) +
                        geom_point() +
                        geom_smooth(method = "lm", se = FALSE, color = "blue") +
                        labs(title = "Age Vs Height",
                             x = "Age",
                             y = "Height") +
                        ylim(75,max(ageandheight$height)+2) +
                        theme_minimal() +
                        theme(plot.title = element_text(hjust = 0.5))

grPointageandheightinter<-ggplotly(grPointageandheight)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).
grPointageandheightinter

Se calcula el coeficiente de correlacion para los datos, usando la variable cualitativa que los clasifica, en este caso no_siblings. Al verificar los datos, se observa que el agrupador que tiene datos consistentes y una correlacion adecuada.

#Coeficiente de correlación de Pearson
ageandheight %>%
  group_by(no_siblings) %>%
  summarise(cor(age,height,method = "pearson"))
## # A tibble: 6 × 2
##   no_siblings `cor(age, height, method = "pearson")`
##         <dbl>                                  <dbl>
## 1           0                                  0.996
## 2           1                                  1    
## 3           2                                  1    
## 4           3                                 NA    
## 5           4                                  0.998
## 6           5                                 NA

Escogemos el grupo de datos con el no_siblings igual a 4, por que el coeficiente de correlacion es cercano a 1 para la cantidad de datos que se descargaron de la base datos.

Nota: el clasificador no_siblings es un valor variable, pues en el excel esta formulado y puede suceder que la correlacion cambie segun el valor aleatorio que tome cada registro.

nosiblings1 <- ageandheight %>%
                filter(no_siblings == 4) %>%
                mutate(log10Age = log10(age), log10Height = log10(height))

nosiblings1
## # A tibble: 3 × 5
##     age height no_siblings log10Age log10Height
##   <dbl>  <dbl>       <dbl>    <dbl>       <dbl>
## 1    19   77             4     1.28        1.89
## 2    21   78.2           4     1.32        1.89
## 3    23   79.7           4     1.36        1.90

Ahora, se calcula el modelo de regresion lineal.

#Modelo de regresión lineal
modelo <- lm(log10Height~log10Age,data=nosiblings1)
modelo
## 
## Call:
## lm(formula = log10Height ~ log10Age, data = nosiblings1)
## 
## Coefficients:
## (Intercept)     log10Age  
##       1.656        0.180
summary<-summary(modelo)
summary
## 
## Call:
## lm(formula = log10Height ~ log10Age, data = nosiblings1)
## 
## Residuals:
##          1          2          3 
##  0.0003567 -0.0007492  0.0003925 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.65601    0.02067   80.14  0.00794 **
## log10Age     0.17996    0.01564   11.51  0.05519 . 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.000918 on 1 degrees of freedom
## Multiple R-squared:  0.9925, Adjusted R-squared:  0.985 
## F-statistic: 132.4 on 1 and 1 DF,  p-value: 0.05519

Se calcula el intercepto y el slope.

intercept <- modelo[["coefficients"]][["(Intercept)"]]
slope <- modelo[["coefficients"]][["log10Age"]]

Graficamos el modelo de regresion lineal

#Gráfico del modelo
grnosiblings1 <- ggplot(nosiblings1,aes(x=nosiblings1$log10Age,y=nosiblings1$log10Height)) +
                  geom_point() + ggtitle("Regresion linear model") +
                  geom_smooth(method = "lm",color="blue")+
                  ylim(1.875,1.925) +
                  geom_text(aes(label = paste("y = ",round(slope,2),"x + ",round(intercept,2)),x=1.3,y=1.92),color="blue")

grnosiblings1
## Warning: Use of `nosiblings1$log10Age` is discouraged.
## ℹ Use `log10Age` instead.
## Warning: Use of `nosiblings1$log10Height` is discouraged.
## ℹ Use `log10Height` instead.
## Warning: Use of `nosiblings1$log10Age` is discouraged.
## ℹ Use `log10Age` instead.
## Warning: Use of `nosiblings1$log10Height` is discouraged.
## ℹ Use `log10Height` instead.
## Warning in geom_text(aes(label = paste("y = ", round(slope, 2), "x + ", : All aesthetics have length 1, but the data has 3 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## `geom_smooth()` using formula = 'y ~ x'

Se realizan las pruebas del modelo para su aceptacion.

ei <- residuals(modelo);ei
##             1             2             3 
##  0.0003567445 -0.0007492204  0.0003924758
plot(ei)

pred<-sort(fitted(modelo));pred
##        1        2        3 
## 1.886134 1.893956 1.901066
#Detección de puntos influyentes
plot(cooks.distance(modelo))

#Validación del modelo
#si pvalor>0.05 acepto la hipótesis nula

t.test(ei)
## 
##  One Sample t-test
## 
## data:  ei
## t = 9.6437e-17, df = 2, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.001612428  0.001612428
## sample estimates:
##    mean of x 
## 3.614007e-20
shapiro.test(ei)
## 
##  Shapiro-Wilk normality test
## 
## data:  ei
## W = 0.77345, p-value = 0.05257
#Prueba de Breusch Pagan n=3
bptest(modelo)
## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 0.020413, df = 1, p-value = 0.8864
#Prueba de Durbin Watson
dwtest(modelo,alternative = "two.sided")
## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 2.9985, p-value < 2.2e-16
## alternative hypothesis: true autocorrelation is not 0

Se realiza una prueba de comprobacion del error, tomando un bebe de 36 meses para calcular su posible estatura en cm.

#Predicciones
# edad = 36
round(log10(36),2)
## [1] 1.56
pred1 <- predict(object = modelo, newdata = data.frame(log10Age = log10(36)));pred1
##        1 
## 1.936081
total = 10^(pred1);total
##        1 
## 86.31397