This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Cmd+Option+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Cmd+Shift+K to preview the HTML file).
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
str(faithful)
## 'data.frame': 272 obs. of 2 variables:
## $ eruptions: num 3.6 1.8 3.33 2.28 4.53 ...
## $ waiting : num 79 54 74 62 85 55 88 85 51 85 ...
faithful %>% ggplot(aes(x=eruptions, y=waiting)) + geom_point()
fit<-lm(data=faithful, waiting~eruptions)
summary(fit)
##
## Call:
## lm(formula = waiting ~ eruptions, data = faithful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0796 -4.4831 0.2122 3.9246 15.9719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.4744 1.1549 28.98 <2e-16 ***
## eruptions 10.7296 0.3148 34.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.914 on 270 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8108
## F-statistic: 1162 on 1 and 270 DF, p-value: < 2.2e-16
index<-1:nrow(faithful)
shuffle_index<-sample(index)
shuffle_faithful<-faithful[shuffle_index,]
delta<-nrow(faithful)/10
test1<-shuffle_faithful[1:delta,]
train1<-shuffle_faithful[-(1:delta),]
test2<-shuffle_faithful[delta:(2*delta),]
train2<-shuffle_faithful[-(delta:(2*delta)),]
test3<-shuffle_faithful[(2*delta):(3*delta),]
train3<-shuffle_faithful[-((2*delta):(3*delta)),]
test4<-shuffle_faithful[(3*delta):(4*delta),]
train4<-shuffle_faithful[-((3*delta):(4*delta)),]
test5<-shuffle_faithful[(4*delta):(5*delta),]
train5<-shuffle_faithful[-((4*delta):(5*delta)),]
test6<-shuffle_faithful[(5*delta):(6*delta),]
train6<-shuffle_faithful[-((5*delta):(6*delta)),]
test7<-shuffle_faithful[(6*delta):(7*delta),]
train7<-shuffle_faithful[-((6*delta):(7*delta)),]
test8<-shuffle_faithful[(7*delta):(8*delta),]
train8<-shuffle_faithful[-((7*delta):(8*delta)),]
test9<-shuffle_faithful[(8*delta):(9*delta),]
train9<-shuffle_faithful[-((8*delta):(9*delta)),]
test10<-shuffle_faithful[(9*delta):(10*delta),]
train10<-shuffle_faithful[-((9*delta):(10*delta)),]
fit1<-lm(data = train1, waiting~eruptions)
fit2<-lm(data = train2, waiting~eruptions)
fit3<-lm(data = train3, waiting~eruptions)
fit4<-lm(data = train4, waiting~eruptions)
fit5<-lm(data = train5, waiting~eruptions)
fit6<-lm(data = train6, waiting~eruptions)
fit7<-lm(data = train7, waiting~eruptions)
fit8<-lm(data = train8, waiting~eruptions)
fit9<-lm(data = train9, waiting~eruptions)
fit10<-lm(data = train10, waiting~eruptions)
ptest1<-fit1$coefficients[1] + fit1$coefficients[2]*test1$eruptions
ptest2<-fit2$coefficients[1] + fit2$coefficients[2]*test2$eruptions
ptest3<-fit3$coefficients[1] + fit3$coefficients[2]*test3$eruptions
ptest4<-fit4$coefficients[1] + fit4$coefficients[2]*test4$eruptions
ptest5<-fit5$coefficients[1] + fit5$coefficients[2]*test5$eruptions
ptest6<-fit6$coefficients[1] + fit6$coefficients[2]*test6$eruptions
ptest7<-fit7$coefficients[1] + fit7$coefficients[2]*test7$eruptions
ptest8<-fit8$coefficients[1] + fit8$coefficients[2]*test8$eruptions
ptest9<-fit9$coefficients[1] + fit9$coefficients[2]*test9$eruptions
ptest10<-fit10$coefficients[1] + fit10$coefficients[2]*test10$eruptions
RMSE1 <- sqrt(sum( (test1$waiting-as.vector(ptest1))^2 )/nrow(test1) )
RMSE2 <- sqrt(sum( (test2$waiting-as.vector(ptest2))^2 )/nrow(test2) )
RMSE3 <- sqrt(sum( (test3$waiting-as.vector(ptest3))^2 )/nrow(test3) )
RMSE4 <- sqrt(sum( (test4$waiting-as.vector(ptest4))^2 )/nrow(test4) )
RMSE5 <- sqrt(sum( (test5$waiting-as.vector(ptest5))^2 )/nrow(test5) )
RMSE6 <- sqrt(sum( (test6$waiting-as.vector(ptest6))^2 )/nrow(test6) )
RMSE7 <- sqrt(sum( (test7$waiting-as.vector(ptest7))^2 )/nrow(test7) )
RMSE8 <- sqrt(sum( (test8$waiting-as.vector(ptest8))^2 )/nrow(test8) )
RMSE9 <- sqrt(sum( (test9$waiting-as.vector(ptest9))^2 )/nrow(test9) )
RMSE10 <- sqrt(sum( (test10$waiting-as.vector(ptest10))^2 )/nrow(test10) )
c(RMSE1, RMSE2, RMSE3, RMSE4, RMSE5, RMSE6, RMSE7, RMSE8, RMSE9, RMSE10) %>% mean()
## [1] 5.869573
summary(fit)
##
## Call:
## lm(formula = waiting ~ eruptions, data = faithful)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0796 -4.4831 0.2122 3.9246 15.9719
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.4744 1.1549 28.98 <2e-16 ***
## eruptions 10.7296 0.3148 34.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.914 on 270 degrees of freedom
## Multiple R-squared: 0.8115, Adjusted R-squared: 0.8108
## F-statistic: 1162 on 1 and 270 DF, p-value: < 2.2e-16
str(USArrests)
## 'data.frame': 50 obs. of 4 variables:
## $ Murder : num 13.2 10 8.1 8.8 9 7.9 3.3 5.9 15.4 17.4 ...
## $ Assault : int 236 263 294 190 276 204 110 238 335 211 ...
## $ UrbanPop: int 58 48 80 50 91 78 77 72 80 60 ...
## $ Rape : num 21.2 44.5 31 19.5 40.6 38.7 11.1 15.8 31.9 25.8 ...
index1<-1:nrow(USArrests)
shuffle_index1<-sample(index1)
shuffle_USArrests<-USArrests[shuffle_index1,]
delta1<-nrow(USArrests)/5
test_a<-shuffle_USArrests[1:delta,]
train_a<-shuffle_USArrests[-(1:delta),]
test_b<-shuffle_USArrests[delta:(2*delta),]
train_b<-shuffle_USArrests[-(delta:(2*delta)),]
test_c<-shuffle_USArrests[(2*delta):(3*delta),]
train_c<-shuffle_USArrests[-((2*delta):(3*delta)),]
test_d<-shuffle_USArrests[(3*delta):(4*delta),]
train_d<-shuffle_USArrests[-((3*delta):(4*delta)),]
test_e<-shuffle_USArrests[(4*delta):(5*delta),]
train_e<-shuffle_USArrests[-((4*delta):(5*delta)),]
fit_a<-lm(data = train_a, UrbanPop~Murder)
fit_b<-lm(data = train_b, UrbanPop~Murder)
fit_c<-lm(data = train_c, UrbanPop~Murder)
fit_d<-lm(data = train_d, UrbanPop~Murder)
fit_e<-lm(data = train_e, UrbanPop~Murder)
ptest_a<-fit_a$coefficients[1] + fit_a$coefficients[2]*test_a$Murder
ptest_b<-fit_b$coefficients[1] + fit_b$coefficients[2]*test_b$Murder
ptest_c<-fit_c$coefficients[1] + fit_c$coefficients[2]*test_c$Murder
ptest_d<-fit_d$coefficients[1] + fit_d$coefficients[2]*test_d$Murder
ptest_e<-fit_e$coefficients[1] + fit_e$coefficients[2]*test_e$Murder
RMSE_a <- sqrt(sum( (test_a$UrbanPop-as.vector(ptest_a))^2 )/nrow(test_a) )
RMSE_b <- sqrt(sum( (test_b$UrbanPop-as.vector(ptest_b))^2 )/nrow(test_b) )
RMSE_c <- sqrt(sum( (test_c$UrbanPop-as.vector(ptest_c))^2 )/nrow(test_c) )
RMSE_d <- sqrt(sum( (test_d$UrbanPop-as.vector(ptest_d))^2 )/nrow(test_d) )
RMSE_e <- sqrt(sum( (test_e$UrbanPop-as.vector(ptest_e))^2 )/nrow(test_e) )
c(RMSE_a, RMSE_b, RMSE_c, RMSE_d, RMSE_e)
## [1] 16.54324 NA NA NA NA
result<-lm(data = USArrests, UrbanPop~Murder)
result
##
## Call:
## lm(formula = UrbanPop ~ Murder, data = USArrests)
##
## Coefficients:
## (Intercept) Murder
## 63.7393 0.2312
summary(result)
##
## Call:
## lm(formula = UrbanPop ~ Murder, data = USArrests)
##
## Residuals:
## Min 1Q Median 3Q Max
## -32.248 -9.953 1.255 12.482 25.180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 63.7393 4.2597 14.963 <2e-16 ***
## Murder 0.2312 0.4785 0.483 0.631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 14.59 on 48 degrees of freedom
## Multiple R-squared: 0.00484, Adjusted R-squared: -0.01589
## F-statistic: 0.2335 on 1 and 48 DF, p-value: 0.6312