Librerias

library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.4.1
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.1
## Cargando paquete requerido: zoo
## Warning: package 'zoo' was built under R version 4.4.1
## 
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.1
## corrplot 0.92 loaded
library(reshape2)
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.1

Dataset

ageandheight <- read_excel("E:/2023_Capacitaciones/BIG DATA UAO/ageandheight.xls")

ageandheight %>% ggplot(
                    aes(
                        x = height, # log10(height/10^6), 
                        y = age    # log10(age) 
                        )
                    ) +
            geom_point(show.legend = FALSE)  +
            xlab("height") +
            ylab("Edad") 

# Coeficiente de correlacion de pearson

ageandheight %>% 
  summarise(cor(height , age, method = "pearson" ))
## # A tibble: 1 × 1
##   `cor(height, age, method = "pearson")`
##                                    <dbl>
## 1                                  0.994
correlation_matrix <- cor(ageandheight[1:2], use = "complete.obs")


# Crear el gráfico
corrplot(correlation_matrix, method="number", type="upper")

modeloh <- ageandheight %>% select(height,age) %>%
                lm( ageandheight$height~ ageandheight$age, data =.)

summary <- summary(modeloh)
summary
## 
## Call:
## lm(formula = ageandheight$height ~ ageandheight$age, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27238 -0.24248 -0.02762  0.16014  0.47238 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       64.9283     0.5084  127.71  < 2e-16 ***
## ageandheight$age   0.6350     0.0214   29.66 4.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.256 on 10 degrees of freedom
## Multiple R-squared:  0.9888, Adjusted R-squared:  0.9876 
## F-statistic:   880 on 1 and 10 DF,  p-value: 4.428e-11
intercepto <- modeloh[["coefficients"]][["(Intercept)"]]
slope <- modeloh[["coefficients"]][["ageandheight$age"]]




ggplot(ageandheight, aes(x= height, y=age))+
              geom_point() +
              ggtitle( "Regresion linear model") + 
              geom_smooth(method = "lm", color="blue") +
              geom_text(aes(x = min(ageandheight$height), 
                            y = max(ageandheight$age),
                            label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))), 
                        color = "blue", hjust = 0, vjust = 1)
## Warning: Use of `ageandheight$height` is discouraged.
## ℹ Use `height` instead.
## Warning: Use of `ageandheight$age` is discouraged.
## ℹ Use `age` instead.
## Warning in geom_text(aes(x = min(ageandheight$height), y = max(ageandheight$age), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## `geom_smooth()` using formula = 'y ~ x'

## Analisis De Residuales

ei <- residuals(modeloh);ei
##            1            2            3            4            5            6 
## -0.257692308  0.007342657  0.472377622 -0.062587413 -0.097552448  0.167482517 
##            7            8            9           10           11           12 
## -0.267482517  0.297552448 -0.237412587 -0.272377622  0.092657343  0.157692308
pred<- sort(fitted(modeloh));pred
##        1        2        3        4        5        6        7        8 
## 76.35769 76.99266 77.62762 78.26259 78.89755 79.53252 80.16748 80.80245 
##        9       10       11       12 
## 81.43741 82.07238 82.70734 83.34231

Detencion De Punto Influyentes

plot(cooks.distance(modeloh))

Validacion Del Modelo

H0 = los errores tiene media cero

HA = los errores no tiene media cero

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

t.test(ei)
## 
##  One Sample t-test
## 
## data:  ei
## t = -1.5599e-16, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1550632  0.1550632
## sample estimates:
##     mean of x 
## -1.098997e-17

H0 = los errores tienen una distribucion normal

HA = los errores no tienen una distribucion normal

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05

shapiro.test(ei)
## 
##  Shapiro-Wilk normality test
## 
## data:  ei
## W = 0.9234, p-value = 0.3154

H0 = la varianza de los errores es constante

HA = la varianza de los errores no es constante

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

bptest(modeloh)
## 
##  studentized Breusch-Pagan test
## 
## data:  modeloh
## BP = 0.39003, df = 1, p-value = 0.5323

H0 = los errores son independientes

HA = los errores no son independientes

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05

dwtest(modeloh, alternative = "two.sided")
## 
##  Durbin-Watson test
## 
## data:  modeloh
## DW = 2.4078, p-value = 0.7003
## alternative hypothesis: true autocorrelation is not 0

Simulaciones

fun <- function(var){
  
  pred1 <- predict(object = modeloh, newdata = data.frame(
    height = var))
  total= (pred1)
  options(scipen = 999)
  print(paste("Valor a evaluar ", var ," prediccion ", total  ))
  
}

fun(22)
## Warning: 'newdata' had 1 row but variables found have 12 rows
##  [1] "Valor a evaluar  22  prediccion  76.3576923076923"
##  [2] "Valor a evaluar  22  prediccion  76.9926573426574"
##  [3] "Valor a evaluar  22  prediccion  77.6276223776224"
##  [4] "Valor a evaluar  22  prediccion  78.2625874125874"
##  [5] "Valor a evaluar  22  prediccion  78.8975524475525"
##  [6] "Valor a evaluar  22  prediccion  79.5325174825175"
##  [7] "Valor a evaluar  22  prediccion  80.1674825174825"
##  [8] "Valor a evaluar  22  prediccion  80.8024475524476"
##  [9] "Valor a evaluar  22  prediccion  81.4374125874126"
## [10] "Valor a evaluar  22  prediccion  82.0723776223776"
## [11] "Valor a evaluar  22  prediccion  82.7073426573427"
## [12] "Valor a evaluar  22  prediccion  83.3423076923077"

Tranformacion De Los Datos

ageandheight %>% ggplot(
  aes(
    x = height, 
    y = age    
  )
) +
  geom_point(show.legend = FALSE)  +
  xlab("height") +
  ylab("Edad") 

Coeficiente De Correlacion De Pearson

ageandheight <- ageandheight %>% mutate(logH = log10(height), logA = log10(age))

ageandheight %>% 
  summarise(cor(logH , logA, method = "pearson" ))
## # A tibble: 1 × 1
##   `cor(logH, logA, method = "pearson")`
##                                   <dbl>
## 1                                 0.993
correlation_matrix <- cor(ageandheight[,4:5], use = "complete.obs")

Crear El Gráfico

corrplot(correlation_matrix, method="number", type="upper")

modeloh2 <- ageandheight %>% select(logH,logA) %>%
  lm( ageandheight$logH~ ageandheight$logA, data =.)

summary <- summary(modeloh2)
summary
## 
## Call:
## lm(formula = ageandheight$logH ~ ageandheight$logA, data = .)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -0.00212345 -0.00122624 -0.00005759  0.00105636  0.00254608 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       1.650782   0.009310  177.32 < 0.0000000000000002 ***
## ageandheight$logA 0.183949   0.006806   27.03       0.000000000111 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001527 on 10 degrees of freedom
## Multiple R-squared:  0.9865, Adjusted R-squared:  0.9851 
## F-statistic: 730.4 on 1 and 10 DF,  p-value: 0.0000000001112
intercepto <- modeloh2[["coefficients"]][["(Intercept)"]]
slope <- modeloh2[["coefficients"]][["ageandheight$logA"]]



ggplot(ageandheight, aes(x= logH, y=logA))+
  geom_point() +
  ggtitle( "Regresion linear model") + 
  geom_smooth(method = "lm", color="blue") +
  geom_text(aes(x = min(ageandheight$logH), 
                y = max(ageandheight$logA),
                label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))), 
            color = "blue", hjust = 0, vjust = 1)
## Warning: Use of `ageandheight$logH` is discouraged.
## ℹ Use `logH` instead.
## Warning: Use of `ageandheight$logA` is discouraged.
## ℹ Use `logA` instead.
## Warning in geom_text(aes(x = min(ageandheight$logH), y = max(ageandheight$logA), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.
## `geom_smooth()` using formula = 'y ~ x'

Analisis De Residuales

ei2 <- residuals(modeloh2);ei2
##             1             2             3             4             5 
## -0.0003032733  0.0004834811  0.0025460790 -0.0007959430 -0.0011928625 
##             6             7             8             9            10 
##  0.0001880858 -0.0021234523  0.0010894413 -0.0015086420 -0.0013263611 
##            11            12 
##  0.0010453392  0.0018981079
pred2<- sort(fitted(modeloh2));pred2
##        1        2        3        4        5        6        7        8 
## 1.881688 1.886007 1.890105 1.894003 1.897719 1.901270 1.904670 1.907931 
##        9       10       11       12 
## 1.911065 1.914080 1.916985 1.919788

Detencion De Punto Influyentes

plot(cooks.distance(modeloh2))

Validacion Del Modelo

H0 = los errores tiene media cero

HA = los errores no tiene media cero

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

t.test(ei2)
## 
##  One Sample t-test
## 
## data:  ei2
## t = 0.000000000000000016119, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0009252468  0.0009252468
## sample estimates:
##                     mean of x 
## 0.000000000000000000006776264

H0 = los errores tienen una distribucion normal

HA = los errores no tienen una distribucion normal

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

shapiro.test(ei2)
## 
##  Shapiro-Wilk normality test
## 
## data:  ei2
## W = 0.96503, p-value = 0.8525

H0 = la varianza de los errores es constante

HA = la varianza de los errores no es constante

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

bptest(modeloh2)
## 
##  studentized Breusch-Pagan test
## 
## data:  modeloh2
## BP = 0.32857, df = 1, p-value = 0.5665

H0 = los errores son independientes

HA = los errores no son independientes

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

dwtest(modeloh2, alternative = "two.sided")
## 
##  Durbin-Watson test
## 
## data:  modeloh2
## DW = 2.0109, p-value = 0.7299
## alternative hypothesis: true autocorrelation is not 0

Simulaciones

fun2 <- function(var){
  logaritmo <-  round(log10(var))
  pred1 <- predict(object = modeloh2, newdata = data.frame(
    logH =logaritmo));pred1
  total=10^(pred1);total
  options(scipen = 999)
  print(paste("Valor a evaluar ", var ," prediccion ", total  ))
  
}


fun2(22)
## Warning: 'newdata' had 1 row but variables found have 12 rows
##  [1] "Valor a evaluar  22  prediccion  76.1531601534288"
##  [2] "Valor a evaluar  22  prediccion  76.9143269489985"
##  [3] "Valor a evaluar  22  prediccion  77.6434733077853"
##  [4] "Valor a evaluar  22  prediccion  78.3434506303829"
##  [5] "Valor a evaluar  22  prediccion  79.0167349077999"
##  [6] "Valor a evaluar  22  prediccion  79.665490719599" 
##  [7] "Valor a evaluar  22  prediccion  80.2916220493512"
##  [8] "Valor a evaluar  22  prediccion  80.8968130626941"
##  [9] "Valor a evaluar  22  prediccion  81.4825611483393"
## [10] "Valor a evaluar  22  prediccion  82.0502039278818"
## [11] "Valor a evaluar  22  prediccion  82.6009415155371"
## [12] "Valor a evaluar  22  prediccion  83.1358550012099"