Librerias

library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.4.1

library(dslabs)

## Warning: package 'dslabs' was built under R version 4.4.1

library(lmtest)

## Warning: package 'lmtest' was built under R version 4.4.1

## Cargando paquete requerido: zoo

## Warning: package 'zoo' was built under R version 4.4.1

## 
## Adjuntando el paquete: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

library(corrplot)

## Warning: package 'corrplot' was built under R version 4.4.1

## corrplot 0.92 loaded

library(reshape2)
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.1

Dataset

ageandheight <- read_excel("E:/2023_Capacitaciones/BIG DATA UAO/ageandheight.xls")

ageandheight %>% ggplot(
                    aes(
                        x = height, # log10(height/10^6), 
                        y = age    # log10(age) 
                        )
                    ) +
            geom_point(show.legend = FALSE)  +
            xlab("height") +
            ylab("Edad")

# Coeficiente de correlacion de pearson

ageandheight %>% 
  summarise(cor(height , age, method = "pearson" ))

## # A tibble: 1 × 1
##   `cor(height, age, method = "pearson")`
##                                    <dbl>
## 1                                  0.994

correlation_matrix <- cor(ageandheight[1:2], use = "complete.obs")


# Crear el gráfico
corrplot(correlation_matrix, method="number", type="upper")

modeloh <- ageandheight %>% select(height,age) %>%
                lm( ageandheight$height~ ageandheight$age, data =.)

summary <- summary(modeloh)
summary

## 
## Call:
## lm(formula = ageandheight$height ~ ageandheight$age, data = .)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.27238 -0.24248 -0.02762  0.16014  0.47238 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       64.9283     0.5084  127.71  < 2e-16 ***
## ageandheight$age   0.6350     0.0214   29.66 4.43e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.256 on 10 degrees of freedom
## Multiple R-squared:  0.9888, Adjusted R-squared:  0.9876 
## F-statistic:   880 on 1 and 10 DF,  p-value: 4.428e-11

intercepto <- modeloh[["coefficients"]][["(Intercept)"]]
slope <- modeloh[["coefficients"]][["ageandheight$age"]]




ggplot(ageandheight, aes(x= height, y=age))+
              geom_point() +
              ggtitle( "Regresion linear model") + 
              geom_smooth(method = "lm", color="blue") +
              geom_text(aes(x = min(ageandheight$height), 
                            y = max(ageandheight$age),
                            label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))), 
                        color = "blue", hjust = 0, vjust = 1)

## Warning: Use of `ageandheight$height` is discouraged.
## ℹ Use `height` instead.

## Warning: Use of `ageandheight$age` is discouraged.
## ℹ Use `age` instead.

## Warning in geom_text(aes(x = min(ageandheight$height), y = max(ageandheight$age), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## `geom_smooth()` using formula = 'y ~ x'

## Analisis De Residuales

ei <- residuals(modeloh);ei

##            1            2            3            4            5            6 
## -0.257692308  0.007342657  0.472377622 -0.062587413 -0.097552448  0.167482517 
##            7            8            9           10           11           12 
## -0.267482517  0.297552448 -0.237412587 -0.272377622  0.092657343  0.157692308

pred<- sort(fitted(modeloh));pred

##        1        2        3        4        5        6        7        8 
## 76.35769 76.99266 77.62762 78.26259 78.89755 79.53252 80.16748 80.80245 
##        9       10       11       12 
## 81.43741 82.07238 82.70734 83.34231

Detencion De Punto Influyentes

plot(cooks.distance(modeloh))

Validacion Del Modelo

H0 = los errores tiene media cero

HA = los errores no tiene media cero

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

t.test(ei)

## 
##  One Sample t-test
## 
## data:  ei
## t = -1.5599e-16, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.1550632  0.1550632
## sample estimates:
##     mean of x 
## -1.098997e-17

H0 = los errores tienen una distribucion normal

HA = los errores no tienen una distribucion normal

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05

shapiro.test(ei)

## 
##  Shapiro-Wilk normality test
## 
## data:  ei
## W = 0.9234, p-value = 0.3154

H0 = la varianza de los errores es constante

HA = la varianza de los errores no es constante

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

bptest(modeloh)

## 
##  studentized Breusch-Pagan test
## 
## data:  modeloh
## BP = 0.39003, df = 1, p-value = 0.5323

H0 = los errores son independientes

HA = los errores no son independientes

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05

dwtest(modeloh, alternative = "two.sided")

## 
##  Durbin-Watson test
## 
## data:  modeloh
## DW = 2.4078, p-value = 0.7003
## alternative hypothesis: true autocorrelation is not 0

Simulaciones

fun <- function(var){
  
  pred1 <- predict(object = modeloh, newdata = data.frame(
    height = var))
  total= (pred1)
  options(scipen = 999)
  print(paste("Valor a evaluar ", var ," prediccion ", total  ))
  
}

fun(22)

## Warning: 'newdata' had 1 row but variables found have 12 rows

##  [1] "Valor a evaluar  22  prediccion  76.3576923076923"
##  [2] "Valor a evaluar  22  prediccion  76.9926573426574"
##  [3] "Valor a evaluar  22  prediccion  77.6276223776224"
##  [4] "Valor a evaluar  22  prediccion  78.2625874125874"
##  [5] "Valor a evaluar  22  prediccion  78.8975524475525"
##  [6] "Valor a evaluar  22  prediccion  79.5325174825175"
##  [7] "Valor a evaluar  22  prediccion  80.1674825174825"
##  [8] "Valor a evaluar  22  prediccion  80.8024475524476"
##  [9] "Valor a evaluar  22  prediccion  81.4374125874126"
## [10] "Valor a evaluar  22  prediccion  82.0723776223776"
## [11] "Valor a evaluar  22  prediccion  82.7073426573427"
## [12] "Valor a evaluar  22  prediccion  83.3423076923077"

Tranformacion De Los Datos

ageandheight %>% ggplot(
  aes(
    x = height, 
    y = age    
  )
) +
  geom_point(show.legend = FALSE)  +
  xlab("height") +
  ylab("Edad")

Coeficiente De Correlacion De Pearson

ageandheight <- ageandheight %>% mutate(logH = log10(height), logA = log10(age))

ageandheight %>% 
  summarise(cor(logH , logA, method = "pearson" ))

## # A tibble: 1 × 1
##   `cor(logH, logA, method = "pearson")`
##                                   <dbl>
## 1                                 0.993

correlation_matrix <- cor(ageandheight[,4:5], use = "complete.obs")

Crear El Gráfico

corrplot(correlation_matrix, method="number", type="upper")

modeloh2 <- ageandheight %>% select(logH,logA) %>%
  lm( ageandheight$logH~ ageandheight$logA, data =.)

summary <- summary(modeloh2)
summary

## 
## Call:
## lm(formula = ageandheight$logH ~ ageandheight$logA, data = .)
## 
## Residuals:
##         Min          1Q      Median          3Q         Max 
## -0.00212345 -0.00122624 -0.00005759  0.00105636  0.00254608 
## 
## Coefficients:
##                   Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)       1.650782   0.009310  177.32 < 0.0000000000000002 ***
## ageandheight$logA 0.183949   0.006806   27.03       0.000000000111 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001527 on 10 degrees of freedom
## Multiple R-squared:  0.9865, Adjusted R-squared:  0.9851 
## F-statistic: 730.4 on 1 and 10 DF,  p-value: 0.0000000001112

intercepto <- modeloh2[["coefficients"]][["(Intercept)"]]
slope <- modeloh2[["coefficients"]][["ageandheight$logA"]]



ggplot(ageandheight, aes(x= logH, y=logA))+
  geom_point() +
  ggtitle( "Regresion linear model") + 
  geom_smooth(method = "lm", color="blue") +
  geom_text(aes(x = min(ageandheight$logH), 
                y = max(ageandheight$logA),
                label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))), 
            color = "blue", hjust = 0, vjust = 1)

## Warning: Use of `ageandheight$logH` is discouraged.
## ℹ Use `logH` instead.

## Warning: Use of `ageandheight$logA` is discouraged.
## ℹ Use `logA` instead.

## Warning in geom_text(aes(x = min(ageandheight$logH), y = max(ageandheight$logA), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

## `geom_smooth()` using formula = 'y ~ x'

Analisis De Residuales

ei2 <- residuals(modeloh2);ei2

##             1             2             3             4             5 
## -0.0003032733  0.0004834811  0.0025460790 -0.0007959430 -0.0011928625 
##             6             7             8             9            10 
##  0.0001880858 -0.0021234523  0.0010894413 -0.0015086420 -0.0013263611 
##            11            12 
##  0.0010453392  0.0018981079

pred2<- sort(fitted(modeloh2));pred2

##        1        2        3        4        5        6        7        8 
## 1.881688 1.886007 1.890105 1.894003 1.897719 1.901270 1.904670 1.907931 
##        9       10       11       12 
## 1.911065 1.914080 1.916985 1.919788

Detencion De Punto Influyentes

plot(cooks.distance(modeloh2))

Validacion Del Modelo

H0 = los errores tiene media cero

HA = los errores no tiene media cero

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

t.test(ei2)

## 
##  One Sample t-test
## 
## data:  ei2
## t = 0.000000000000000016119, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.0009252468  0.0009252468
## sample estimates:
##                     mean of x 
## 0.000000000000000000006776264

H0 = los errores tienen una distribucion normal

HA = los errores no tienen una distribucion normal

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

shapiro.test(ei2)

## 
##  Shapiro-Wilk normality test
## 
## data:  ei2
## W = 0.96503, p-value = 0.8525

H0 = la varianza de los errores es constante

HA = la varianza de los errores no es constante

Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

bptest(modeloh2)

## 
##  studentized Breusch-Pagan test
## 
## data:  modeloh2
## BP = 0.32857, df = 1, p-value = 0.5665

H0 = los errores son independientes

HA = los errores no son independientes

Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05

dwtest(modeloh2, alternative = "two.sided")

## 
##  Durbin-Watson test
## 
## data:  modeloh2
## DW = 2.0109, p-value = 0.7299
## alternative hypothesis: true autocorrelation is not 0

Simulaciones

fun2 <- function(var){
  logaritmo <-  round(log10(var))
  pred1 <- predict(object = modeloh2, newdata = data.frame(
    logH =logaritmo));pred1
  total=10^(pred1);total
  options(scipen = 999)
  print(paste("Valor a evaluar ", var ," prediccion ", total  ))
  
}


fun2(22)

## Warning: 'newdata' had 1 row but variables found have 12 rows

##  [1] "Valor a evaluar  22  prediccion  76.1531601534288"
##  [2] "Valor a evaluar  22  prediccion  76.9143269489985"
##  [3] "Valor a evaluar  22  prediccion  77.6434733077853"
##  [4] "Valor a evaluar  22  prediccion  78.3434506303829"
##  [5] "Valor a evaluar  22  prediccion  79.0167349077999"
##  [6] "Valor a evaluar  22  prediccion  79.665490719599" 
##  [7] "Valor a evaluar  22  prediccion  80.2916220493512"
##  [8] "Valor a evaluar  22  prediccion  80.8968130626941"
##  [9] "Valor a evaluar  22  prediccion  81.4825611483393"
## [10] "Valor a evaluar  22  prediccion  82.0502039278818"
## [11] "Valor a evaluar  22  prediccion  82.6009415155371"
## [12] "Valor a evaluar  22  prediccion  83.1358550012099"

Taller Regresion Lineal

Diego Aguilar

2024-08-20

Librerias

Dataset

Detencion De Punto Influyentes

Validacion Del Modelo

Simulaciones

Tranformacion De Los Datos

Coeficiente De Correlacion De Pearson

Crear El Gráfico

Analisis De Residuales

Detencion De Punto Influyentes

Validacion Del Modelo

Simulaciones