library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.1
library(dslabs)
## Warning: package 'dslabs' was built under R version 4.4.1
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.4.1
## Cargando paquete requerido: zoo
## Warning: package 'zoo' was built under R version 4.4.1
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.1
## corrplot 0.92 loaded
library(reshape2)
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.1
ageandheight <- read_excel("E:/2023_Capacitaciones/BIG DATA UAO/ageandheight.xls")
ageandheight %>% ggplot(
aes(
x = height, # log10(height/10^6),
y = age # log10(age)
)
) +
geom_point(show.legend = FALSE) +
xlab("height") +
ylab("Edad")
# Coeficiente de correlacion de pearson
ageandheight %>%
summarise(cor(height , age, method = "pearson" ))
## # A tibble: 1 × 1
## `cor(height, age, method = "pearson")`
## <dbl>
## 1 0.994
correlation_matrix <- cor(ageandheight[1:2], use = "complete.obs")
# Crear el gráfico
corrplot(correlation_matrix, method="number", type="upper")
modeloh <- ageandheight %>% select(height,age) %>%
lm( ageandheight$height~ ageandheight$age, data =.)
summary <- summary(modeloh)
summary
##
## Call:
## lm(formula = ageandheight$height ~ ageandheight$age, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27238 -0.24248 -0.02762 0.16014 0.47238
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 64.9283 0.5084 127.71 < 2e-16 ***
## ageandheight$age 0.6350 0.0214 29.66 4.43e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.256 on 10 degrees of freedom
## Multiple R-squared: 0.9888, Adjusted R-squared: 0.9876
## F-statistic: 880 on 1 and 10 DF, p-value: 4.428e-11
intercepto <- modeloh[["coefficients"]][["(Intercept)"]]
slope <- modeloh[["coefficients"]][["ageandheight$age"]]
ggplot(ageandheight, aes(x= height, y=age))+
geom_point() +
ggtitle( "Regresion linear model") +
geom_smooth(method = "lm", color="blue") +
geom_text(aes(x = min(ageandheight$height),
y = max(ageandheight$age),
label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))),
color = "blue", hjust = 0, vjust = 1)
## Warning: Use of `ageandheight$height` is discouraged.
## ℹ Use `height` instead.
## Warning: Use of `ageandheight$age` is discouraged.
## ℹ Use `age` instead.
## Warning in geom_text(aes(x = min(ageandheight$height), y = max(ageandheight$age), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## `geom_smooth()` using formula = 'y ~ x'
## Analisis De Residuales
ei <- residuals(modeloh);ei
## 1 2 3 4 5 6
## -0.257692308 0.007342657 0.472377622 -0.062587413 -0.097552448 0.167482517
## 7 8 9 10 11 12
## -0.267482517 0.297552448 -0.237412587 -0.272377622 0.092657343 0.157692308
pred<- sort(fitted(modeloh));pred
## 1 2 3 4 5 6 7 8
## 76.35769 76.99266 77.62762 78.26259 78.89755 79.53252 80.16748 80.80245
## 9 10 11 12
## 81.43741 82.07238 82.70734 83.34231
plot(cooks.distance(modeloh))
H0 = los errores tiene media cero
HA = los errores no tiene media cero
Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
t.test(ei)
##
## One Sample t-test
##
## data: ei
## t = -1.5599e-16, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.1550632 0.1550632
## sample estimates:
## mean of x
## -1.098997e-17
H0 = los errores tienen una distribucion normal
HA = los errores no tienen una distribucion normal
Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05
shapiro.test(ei)
##
## Shapiro-Wilk normality test
##
## data: ei
## W = 0.9234, p-value = 0.3154
H0 = la varianza de los errores es constante
HA = la varianza de los errores no es constante
Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
bptest(modeloh)
##
## studentized Breusch-Pagan test
##
## data: modeloh
## BP = 0.39003, df = 1, p-value = 0.5323
H0 = los errores son independientes
HA = los errores no son independientes
Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula P-valor > 0,05
dwtest(modeloh, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: modeloh
## DW = 2.4078, p-value = 0.7003
## alternative hypothesis: true autocorrelation is not 0
fun <- function(var){
pred1 <- predict(object = modeloh, newdata = data.frame(
height = var))
total= (pred1)
options(scipen = 999)
print(paste("Valor a evaluar ", var ," prediccion ", total ))
}
fun(22)
## Warning: 'newdata' had 1 row but variables found have 12 rows
## [1] "Valor a evaluar 22 prediccion 76.3576923076923"
## [2] "Valor a evaluar 22 prediccion 76.9926573426574"
## [3] "Valor a evaluar 22 prediccion 77.6276223776224"
## [4] "Valor a evaluar 22 prediccion 78.2625874125874"
## [5] "Valor a evaluar 22 prediccion 78.8975524475525"
## [6] "Valor a evaluar 22 prediccion 79.5325174825175"
## [7] "Valor a evaluar 22 prediccion 80.1674825174825"
## [8] "Valor a evaluar 22 prediccion 80.8024475524476"
## [9] "Valor a evaluar 22 prediccion 81.4374125874126"
## [10] "Valor a evaluar 22 prediccion 82.0723776223776"
## [11] "Valor a evaluar 22 prediccion 82.7073426573427"
## [12] "Valor a evaluar 22 prediccion 83.3423076923077"
ageandheight %>% ggplot(
aes(
x = height,
y = age
)
) +
geom_point(show.legend = FALSE) +
xlab("height") +
ylab("Edad")
ageandheight <- ageandheight %>% mutate(logH = log10(height), logA = log10(age))
ageandheight %>%
summarise(cor(logH , logA, method = "pearson" ))
## # A tibble: 1 × 1
## `cor(logH, logA, method = "pearson")`
## <dbl>
## 1 0.993
correlation_matrix <- cor(ageandheight[,4:5], use = "complete.obs")
corrplot(correlation_matrix, method="number", type="upper")
modeloh2 <- ageandheight %>% select(logH,logA) %>%
lm( ageandheight$logH~ ageandheight$logA, data =.)
summary <- summary(modeloh2)
summary
##
## Call:
## lm(formula = ageandheight$logH ~ ageandheight$logA, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.00212345 -0.00122624 -0.00005759 0.00105636 0.00254608
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.650782 0.009310 177.32 < 0.0000000000000002 ***
## ageandheight$logA 0.183949 0.006806 27.03 0.000000000111 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.001527 on 10 degrees of freedom
## Multiple R-squared: 0.9865, Adjusted R-squared: 0.9851
## F-statistic: 730.4 on 1 and 10 DF, p-value: 0.0000000001112
intercepto <- modeloh2[["coefficients"]][["(Intercept)"]]
slope <- modeloh2[["coefficients"]][["ageandheight$logA"]]
ggplot(ageandheight, aes(x= logH, y=logA))+
geom_point() +
ggtitle( "Regresion linear model") +
geom_smooth(method = "lm", color="blue") +
geom_text(aes(x = min(ageandheight$logH),
y = max(ageandheight$logA),
label = paste("y = ", round(slope, 2), "x +", round(intercepto, 2))),
color = "blue", hjust = 0, vjust = 1)
## Warning: Use of `ageandheight$logH` is discouraged.
## ℹ Use `logH` instead.
## Warning: Use of `ageandheight$logA` is discouraged.
## ℹ Use `logA` instead.
## Warning in geom_text(aes(x = min(ageandheight$logH), y = max(ageandheight$logA), : All aesthetics have length 1, but the data has 12 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
## `geom_smooth()` using formula = 'y ~ x'
ei2 <- residuals(modeloh2);ei2
## 1 2 3 4 5
## -0.0003032733 0.0004834811 0.0025460790 -0.0007959430 -0.0011928625
## 6 7 8 9 10
## 0.0001880858 -0.0021234523 0.0010894413 -0.0015086420 -0.0013263611
## 11 12
## 0.0010453392 0.0018981079
pred2<- sort(fitted(modeloh2));pred2
## 1 2 3 4 5 6 7 8
## 1.881688 1.886007 1.890105 1.894003 1.897719 1.901270 1.904670 1.907931
## 9 10 11 12
## 1.911065 1.914080 1.916985 1.919788
plot(cooks.distance(modeloh2))
H0 = los errores tiene media cero
HA = los errores no tiene media cero
Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
t.test(ei2)
##
## One Sample t-test
##
## data: ei2
## t = 0.000000000000000016119, df = 11, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.0009252468 0.0009252468
## sample estimates:
## mean of x
## 0.000000000000000000006776264
H0 = los errores tienen una distribucion normal
HA = los errores no tienen una distribucion normal
Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
shapiro.test(ei2)
##
## Shapiro-Wilk normality test
##
## data: ei2
## W = 0.96503, p-value = 0.8525
H0 = la varianza de los errores es constante
HA = la varianza de los errores no es constante
Si el pe valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
bptest(modeloh2)
##
## studentized Breusch-Pagan test
##
## data: modeloh2
## BP = 0.32857, df = 1, p-value = 0.5665
H0 = los errores son independientes
HA = los errores no son independientes
Si el p-valor es mayor que el nivel de significancia se acepta la hipotesis nula Pvalor > 0,05
dwtest(modeloh2, alternative = "two.sided")
##
## Durbin-Watson test
##
## data: modeloh2
## DW = 2.0109, p-value = 0.7299
## alternative hypothesis: true autocorrelation is not 0
fun2 <- function(var){
logaritmo <- round(log10(var))
pred1 <- predict(object = modeloh2, newdata = data.frame(
logH =logaritmo));pred1
total=10^(pred1);total
options(scipen = 999)
print(paste("Valor a evaluar ", var ," prediccion ", total ))
}
fun2(22)
## Warning: 'newdata' had 1 row but variables found have 12 rows
## [1] "Valor a evaluar 22 prediccion 76.1531601534288"
## [2] "Valor a evaluar 22 prediccion 76.9143269489985"
## [3] "Valor a evaluar 22 prediccion 77.6434733077853"
## [4] "Valor a evaluar 22 prediccion 78.3434506303829"
## [5] "Valor a evaluar 22 prediccion 79.0167349077999"
## [6] "Valor a evaluar 22 prediccion 79.665490719599"
## [7] "Valor a evaluar 22 prediccion 80.2916220493512"
## [8] "Valor a evaluar 22 prediccion 80.8968130626941"
## [9] "Valor a evaluar 22 prediccion 81.4825611483393"
## [10] "Valor a evaluar 22 prediccion 82.0502039278818"
## [11] "Valor a evaluar 22 prediccion 82.6009415155371"
## [12] "Valor a evaluar 22 prediccion 83.1358550012099"