library(dplyr)
library(ggplot2)
input_data <- faithful
waiting<-rnorm(n = 272, mean = 10, sd = 1)
eruptions<- 3.14*waiting^5+rnorm(n = 272, mean = 25, sd = 5)
input_data <- data.frame(eruptions, waiting)
names(input_data)
[1] "eruptions" "waiting"
head(input_data)
summary(input_data)
eruptions waiting
Min. : 69423 Min. : 7.394
1st Qu.: 234271 1st Qu.: 9.431
Median : 327120 Median :10.082
Mean : 353039 Mean :10.053
3rd Qu.: 445116 3rd Qu.:10.723
Max. :1133038 Max. :12.926
input_data %>%
ggplot(aes(x=waiting, y=eruptions ) ) +
geom_point()
\[\widehat{eruption}=\hat \beta_1 {waiting}+\hat \beta_0\]
fit<-lm(eruptions~ waiting^5 ,data=input_data)
fit
Call:
lm(formula = eruptions ~ waiting^5, data = input_data)
Coefficients:
(Intercept) waiting
-1326624 167073
str(fit)
List of 12
$ coefficients : Named num [1:2] -1326624 167073
..- attr(*, "names")= chr [1:2] "(Intercept)" "waiting"
$ residuals : Named num [1:272] -29415 -4720 -2267 -30237 -30311 ...
..- attr(*, "names")= chr [1:272] "1" "2" "3" "4" ...
$ effects : Named num [1:272] -5822472 2686455 -761 -28546 -28668 ...
..- attr(*, "names")= chr [1:272] "(Intercept)" "waiting" "" "" ...
$ rank : int 2
$ fitted.values: Named num [1:272] 405069 214325 519380 346928 392084 ...
..- attr(*, "names")= chr [1:272] "1" "2" "3" "4" ...
$ assign : int [1:2] 0 1
$ qr :List of 5
..$ qr : num [1:272, 1:2] -16.4924 0.0606 0.0606 0.0606 0.0606 ...
.. ..- attr(*, "dimnames")=List of 2
.. .. ..$ : chr [1:272] "1" "2" "3" "4" ...
.. .. ..$ : chr [1:2] "(Intercept)" "waiting"
.. ..- attr(*, "assign")= int [1:2] 0 1
..$ qraux: num [1:2] 1.06 1.05
..$ pivot: int [1:2] 1 2
..$ tol : num 1e-07
..$ rank : int 2
..- attr(*, "class")= chr "qr"
$ df.residual : int 270
$ xlevels : Named list()
$ call : language lm(formula = eruptions ~ waiting^5, data = input_data)
$ terms :Classes 'terms', 'formula' language eruptions ~ waiting^5
.. ..- attr(*, "variables")= language list(eruptions, waiting)
.. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. ..- attr(*, "dimnames")=List of 2
.. .. .. ..$ : chr [1:2] "eruptions" "waiting"
.. .. .. ..$ : chr "waiting"
.. ..- attr(*, "term.labels")= chr "waiting"
.. ..- attr(*, "order")= int 1
.. ..- attr(*, "intercept")= int 1
.. ..- attr(*, "response")= int 1
.. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. ..- attr(*, "predvars")= language list(eruptions, waiting)
.. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. ..- attr(*, "names")= chr [1:2] "eruptions" "waiting"
$ model :'data.frame': 272 obs. of 2 variables:
..$ eruptions: num [1:272] 375654 209605 517112 316690 361773 ...
..$ waiting : num [1:272] 10.36 9.22 11.05 10.02 10.29 ...
..- attr(*, "terms")=Classes 'terms', 'formula' language eruptions ~ waiting^5
.. .. ..- attr(*, "variables")= language list(eruptions, waiting)
.. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
.. .. .. ..- attr(*, "dimnames")=List of 2
.. .. .. .. ..$ : chr [1:2] "eruptions" "waiting"
.. .. .. .. ..$ : chr "waiting"
.. .. ..- attr(*, "term.labels")= chr "waiting"
.. .. ..- attr(*, "order")= int 1
.. .. ..- attr(*, "intercept")= int 1
.. .. ..- attr(*, "response")= int 1
.. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
.. .. ..- attr(*, "predvars")= language list(eruptions, waiting)
.. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
.. .. .. ..- attr(*, "names")= chr [1:2] "eruptions" "waiting"
- attr(*, "class")= chr "lm"
fit$coefficients
(Intercept) waiting
-1326623.9 167072.9
str(fit$coefficients)
Named num [1:2] -1326624 167073
- attr(*, "names")= chr [1:2] "(Intercept)" "waiting"
coeff <- fit$coefficients
coeff[1]
(Intercept)
-1326624
coeff[2]
waiting
167072.9
eruption_time <- function(x){
out <- coeff[2]*x+coeff[1]
names(out)<- NULL
return(out)
}
eruption_time(120)
[1] 18722126
plot(input_data$waiting,
fit$fitted.values,
col='red',
type="l",
xlab = "Waiting",
ylab = "eruption")
points(input_data$waiting,input_data$eruptions)
plot(input_data$waiting, fit$residuals)
sum(fit$residuals)
[1] -5.513812e-10
\[RMSE=\sqrt{ \sum_{i=1}^{n} (Y_i-\hat Y_i)^2 }\]
rmse<-
fit$residuals^2 %>% sum() %>% sqrt()
rmse
[1] 743401.8
summary(input_data)
eruptions waiting
Min. : 69423 Min. : 7.394
1st Qu.: 234271 1st Qu.: 9.431
Median : 327120 Median :10.082
Mean : 353039 Mean :10.053
3rd Qu.: 445116 3rd Qu.:10.723
Max. :1133038 Max. :12.926
summary(fit)
Call:
lm(formula = eruptions ~ waiting^5, data = input_data)
Residuals:
Min 1Q Median 3Q Max
-30873 -27988 -16568 11295 300095
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1326624 28420 -46.68 <2e-16 ***
waiting 167073 2814 59.38 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 45240 on 270 degrees of freedom
Multiple R-squared: 0.9289, Adjusted R-squared: 0.9286
F-statistic: 3526 on 1 and 270 DF, p-value: < 2.2e-16
dim(input_data)
[1] 272 2
set.seed(161)
train_nrows <- round(nrow(input_data)*0.7)
index_train<-
sample(1:nrow(input_data),size = train_nrows,replace = FALSE)
train <- input_data[index_train,]
test <- input_data[-index_train,]
fit_train <- lm(eruptions~waiting, data = train)
summary(fit_train)
Call:
lm(formula = eruptions ~ waiting, data = train)
Residuals:
Min 1Q Median 3Q Max
-29959 -27773 -19097 10639 286449
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1381945 37144 -37.21 <2e-16 ***
waiting 172408 3662 47.09 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 47770 on 188 degrees of freedom
Multiple R-squared: 0.9218, Adjusted R-squared: 0.9214
F-statistic: 2217 on 1 and 188 DF, p-value: < 2.2e-16
pred_test <- fit_train$coefficients[1]+
fit_train$coefficients[2]*test$waiting
rmse_test<-
(test$eruptions- pred_test)^2 %>% sum() %>% sqrt()
rmse_test
[1] 363072
set.seed(234)
kfolds<-
nrow(input_data)/8
index_mix<-sample(1:nrow(input_data))
out<-matrix(nrow = 1, ncol=34)
for(i in 0:7){
lim_inf <- 34*i+1
lim_sup <- 34*i+34
out<-rbind(out,index_mix[lim_inf:lim_sup])
}
kfolds<-out[-1,]
fit_1 <- lm(eruptions~waiting, data = input_data[-kfolds[1,],])
pred_test_1 <- fit_1$coefficients[1]+
fit_1$coefficients[2]*input_data[kfolds[1,],2]
rmse_test<-
(input_data[kfolds[1,],1]- pred_test_1)^2 %>% sum() %>% sqrt()
rmse_test
[1] 330626.7
out<-c()
for(i in 1:8){
fit_i <- lm(eruptions~waiting, data = input_data[-kfolds[i,],])
pred_test_i <- fit_i$coefficients[1]+
fit_i$coefficients[2]*input_data[kfolds[i,],2]
rmse_test_i<-(input_data[kfolds[i,],1]- pred_test_i)^2 %>% sum() %>% sqrt()
out<-c(out,rmse_test_i)
}
out
[1] 330626.7 240947.3 191698.6 344944.0 302238.0 277204.4
[7] 221595.5 197086.5
summary(out)
Min. 1st Qu. Median Mean 3rd Qu. Max.
191699 215468 259076 263293 309335 344944
sd(out)
[1] 59351.28
mean(out)
[1] 263292.6
\[y = m_1x_1+mx +b\] Genero = 0 M \[y=m_1x+b\]
Genero =1 F \[y=m_1x_1+mx+b\]
\(x_1=\) Periodicos \(x_2=\) radio \(x_3=\) TV
\[y = b_0 + b_1 x_1 + b_2x_2+ b_3x_3\]
\[y = b_0 + b_1 x_1 + b_2x_2+ b_3x_3 + b_4x_2*x_3\]
\[y = b_0 + b_1 x_1 + b_2x_2+ (b_3 + b_4x_2)*x_3\]
\[y = e^{kx}\]
\[ln(y) = kx\]
kfolds( df , k ) -> matrix_folds