Luis Alfredo Lemus Paz

16001012

This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.

When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).

library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
str(faithful)
## 'data.frame':    272 obs. of  2 variables:
##  $ eruptions: num  3.6 1.8 3.33 2.28 4.53 ...
##  $ waiting  : num  79 54 74 62 85 55 88 85 51 85 ...
faithful %>% ggplot(aes(x=eruptions, y=waiting)) + geom_point()

fit<-lm(data=faithful, waiting~eruptions)
summary(fit)
## 
## Call:
## lm(formula = waiting ~ eruptions, data = faithful)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0796  -4.4831   0.2122   3.9246  15.9719 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33.4744     1.1549   28.98   <2e-16 ***
## eruptions    10.7296     0.3148   34.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.914 on 270 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8108 
## F-statistic:  1162 on 1 and 270 DF,  p-value: < 2.2e-16

Cross-Validation

index<-1:nrow(faithful)
shuffle_index<-sample(index)
shuffle_faithful<-faithful[shuffle_index,]
delta<-nrow(faithful)/10
test1<-shuffle_faithful[1:delta,]
train1<-shuffle_faithful[-(1:delta),]
test2<-shuffle_faithful[delta:(2*delta),]
train2<-shuffle_faithful[-(delta:(2*delta)),]
test3<-shuffle_faithful[(2*delta):(3*delta),]
train3<-shuffle_faithful[-((2*delta):(3*delta)),]
test4<-shuffle_faithful[(3*delta):(4*delta),]
train4<-shuffle_faithful[-((3*delta):(4*delta)),]
test5<-shuffle_faithful[(4*delta):(5*delta),]
train5<-shuffle_faithful[-((4*delta):(5*delta)),]
test6<-shuffle_faithful[(5*delta):(6*delta),]
train6<-shuffle_faithful[-((5*delta):(6*delta)),]
test7<-shuffle_faithful[(6*delta):(7*delta),]
train7<-shuffle_faithful[-((6*delta):(7*delta)),]
test8<-shuffle_faithful[(7*delta):(8*delta),]
train8<-shuffle_faithful[-((7*delta):(8*delta)),]
test9<-shuffle_faithful[(8*delta):(9*delta),]
train9<-shuffle_faithful[-((8*delta):(9*delta)),]
test10<-shuffle_faithful[(9*delta):(10*delta),]
train10<-shuffle_faithful[-((9*delta):(10*delta)),]
fit1<-lm(data = train1, waiting~eruptions)
fit2<-lm(data = train2, waiting~eruptions)
fit3<-lm(data = train3, waiting~eruptions)
fit4<-lm(data = train4, waiting~eruptions)
fit5<-lm(data = train5, waiting~eruptions)
fit6<-lm(data = train6, waiting~eruptions)
fit7<-lm(data = train7, waiting~eruptions)
fit8<-lm(data = train8, waiting~eruptions)
fit9<-lm(data = train9, waiting~eruptions)
fit10<-lm(data = train10, waiting~eruptions)
ptest1<-fit1$coefficients[1] + fit1$coefficients[2]*test1$eruptions 
ptest2<-fit2$coefficients[1] + fit2$coefficients[2]*test2$eruptions
ptest3<-fit3$coefficients[1] + fit3$coefficients[2]*test3$eruptions 
ptest4<-fit4$coefficients[1] + fit4$coefficients[2]*test4$eruptions
ptest5<-fit5$coefficients[1] + fit5$coefficients[2]*test5$eruptions 
ptest6<-fit6$coefficients[1] + fit6$coefficients[2]*test6$eruptions
ptest7<-fit7$coefficients[1] + fit7$coefficients[2]*test7$eruptions 
ptest8<-fit8$coefficients[1] + fit8$coefficients[2]*test8$eruptions
ptest9<-fit9$coefficients[1] + fit9$coefficients[2]*test9$eruptions 
ptest10<-fit10$coefficients[1] + fit10$coefficients[2]*test10$eruptions
RMSE1 <- sqrt(sum( (test1$waiting-as.vector(ptest1))^2 )/nrow(test1) )
RMSE2 <- sqrt(sum( (test2$waiting-as.vector(ptest2))^2 )/nrow(test2) )
RMSE3 <- sqrt(sum( (test3$waiting-as.vector(ptest3))^2 )/nrow(test3) )
RMSE4 <- sqrt(sum( (test4$waiting-as.vector(ptest4))^2 )/nrow(test4) )
RMSE5 <- sqrt(sum( (test5$waiting-as.vector(ptest5))^2 )/nrow(test5) )
RMSE6 <- sqrt(sum( (test6$waiting-as.vector(ptest6))^2 )/nrow(test6) )
RMSE7 <- sqrt(sum( (test7$waiting-as.vector(ptest7))^2 )/nrow(test7) )
RMSE8 <- sqrt(sum( (test8$waiting-as.vector(ptest8))^2 )/nrow(test8) )
RMSE9 <- sqrt(sum( (test9$waiting-as.vector(ptest9))^2 )/nrow(test9) )
RMSE10 <- sqrt(sum( (test10$waiting-as.vector(ptest10))^2 )/nrow(test10) )
c(RMSE1, RMSE2, RMSE3, RMSE4, RMSE5, RMSE6, RMSE7, RMSE8, RMSE9, RMSE10) %>% mean()
## [1] 5.869573
summary(fit)
## 
## Call:
## lm(formula = waiting ~ eruptions, data = faithful)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0796  -4.4831   0.2122   3.9246  15.9719 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33.4744     1.1549   28.98   <2e-16 ***
## eruptions    10.7296     0.3148   34.09   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.914 on 270 degrees of freedom
## Multiple R-squared:  0.8115, Adjusted R-squared:  0.8108 
## F-statistic:  1162 on 1 and 270 DF,  p-value: < 2.2e-16

Comparacion de los ultimos dos numeros

El RMSE = 5.914 evaluado en lm con respecto al RMSE = 5.846721 generado del Cross-Validation no tiene mucha diferencia por lo que nuestro modelo es aceptable.
str(USArrests)
## 'data.frame':    50 obs. of  4 variables:
##  $ Murder  : num  13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
##  $ Assault : int  236 263 294 190 276 204 110 238 335 211 ...
##  $ UrbanPop: int  58 48 80 50 91 78 77 72 80 60 ...
##  $ Rape    : num  21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
index1<-1:nrow(USArrests)
shuffle_index1<-sample(index1)
shuffle_USArrests<-USArrests[shuffle_index1,]
delta1<-nrow(USArrests)/5
test_a<-shuffle_USArrests[1:delta,]
train_a<-shuffle_USArrests[-(1:delta),]
test_b<-shuffle_USArrests[delta:(2*delta),]
train_b<-shuffle_USArrests[-(delta:(2*delta)),]
test_c<-shuffle_USArrests[(2*delta):(3*delta),]
train_c<-shuffle_USArrests[-((2*delta):(3*delta)),]
test_d<-shuffle_USArrests[(3*delta):(4*delta),]
train_d<-shuffle_USArrests[-((3*delta):(4*delta)),]
test_e<-shuffle_USArrests[(4*delta):(5*delta),]
train_e<-shuffle_USArrests[-((4*delta):(5*delta)),]
fit_a<-lm(data = train_a, UrbanPop~Murder)
fit_b<-lm(data = train_b, UrbanPop~Murder)
fit_c<-lm(data = train_c, UrbanPop~Murder)
fit_d<-lm(data = train_d, UrbanPop~Murder)
fit_e<-lm(data = train_e, UrbanPop~Murder)
ptest_a<-fit_a$coefficients[1] + fit_a$coefficients[2]*test_a$Murder
ptest_b<-fit_b$coefficients[1] + fit_b$coefficients[2]*test_b$Murder
ptest_c<-fit_c$coefficients[1] + fit_c$coefficients[2]*test_c$Murder 
ptest_d<-fit_d$coefficients[1] + fit_d$coefficients[2]*test_d$Murder
ptest_e<-fit_e$coefficients[1] + fit_e$coefficients[2]*test_e$Murder 
RMSE_a <- sqrt(sum( (test_a$UrbanPop-as.vector(ptest_a))^2 )/nrow(test_a) )
RMSE_b <- sqrt(sum( (test_b$UrbanPop-as.vector(ptest_b))^2 )/nrow(test_b) )
RMSE_c <- sqrt(sum( (test_c$UrbanPop-as.vector(ptest_c))^2 )/nrow(test_c) )
RMSE_d <- sqrt(sum( (test_d$UrbanPop-as.vector(ptest_d))^2 )/nrow(test_d) )
RMSE_e <- sqrt(sum( (test_e$UrbanPop-as.vector(ptest_e))^2 )/nrow(test_e) )
c(RMSE_a, RMSE_b, RMSE_c, RMSE_d, RMSE_e)
## [1] 16.54324       NA       NA       NA       NA
result<-lm(data = USArrests, UrbanPop~Murder)
result
## 
## Call:
## lm(formula = UrbanPop ~ Murder, data = USArrests)
## 
## Coefficients:
## (Intercept)       Murder  
##     63.7393       0.2312
summary(result)
## 
## Call:
## lm(formula = UrbanPop ~ Murder, data = USArrests)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -32.248  -9.953   1.255  12.482  25.180 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  63.7393     4.2597  14.963   <2e-16 ***
## Murder        0.2312     0.4785   0.483    0.631    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.59 on 48 degrees of freedom
## Multiple R-squared:  0.00484,    Adjusted R-squared:  -0.01589 
## F-statistic: 0.2335 on 1 and 48 DF,  p-value: 0.6312