hrs = read.csv("/Users/Nazija/Desktop/HRS_r1bmi.csv")
hrs.new = data.frame(hrs$r1bmi, hrs$raedyrs)
names(hrs.new)
## [1] "hrs.r1bmi" "hrs.raedyrs"
dim(hrs.new)
## [1] 12652 2
colnames(hrs.new) = c("bmi","edyrs")
hrs.new = na.omit(hrs.new)
attach(hrs.new)
nrow(hrs.new)
## [1] 12651
#dim(hrs.new)
#dim(hrs.new)[1] is the first element of this vector: the number of rows
n.sub = round(nrow(hrs.new)/2) #split the sample size in two; round() in case it isn't an even number
n.sub
## [1] 6326
Once the simulation runs, MSE will be populated with 1000 mean squared errors.
set.seed(2020) #set the random seed; without it, results differ slightly each run (any number works, even a negative one)
M = 1000 #how many simulations/iterations to run
MSE = vector() #object to store the simulation results; mean squared error is the statistic used to assess fit
There are R packages to help with this (a sketch below), but the loop that follows does the cross-validation by hand so each step is visible.
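A minimal package-based sketch, assuming the boot package is installed (cv.glm() computes the cross-validated prediction error of a GLM, and a gaussian glm() is equivalent to lm() here):
library(boot)
fit.glm = glm(bmi ~ edyrs, data = hrs.new) #gaussian GLM = ordinary least squares
cv.err = cv.glm(hrs.new, fit.glm, K = 10) #10-fold cross-validation
cv.err$delta[1] #raw cross-validated estimate of the MSE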
for(i in 1:M){
#partition the data
ix.train = sample(1:nrow(hrs.new), n.sub, replace = F) #temporary object; draw n.sub indices from 1 to the sample size, without replacement
ix.test = which(!1:nrow(hrs.new) %in% ix.train) #indices that are FALSE, i.e. not in the training set, go into the test set
hrs.train = hrs.new[ix.train,] #use the index vectors to split the hrs data into training and testing sets
hrs.test = hrs.new[ix.test,]
#summary(hrs.train) check descriptive statistics to confirm the two sets differ
#summary(hrs.test)
#fit model(s) to the training data; the purpose of cross-validation is to compare two models
fit = lm(bmi ~ edyrs, data = hrs.train) #estimate the regression on the training data
#summary(fit)
yhat = predict(fit, hrs.test) #predict bmi for the testing data
MSE[i] = mean((hrs.test$bmi - yhat)^2) #squared differences between actual and predicted, averaged within each iteration
}
x %in% y asks whether each element of x is in y and returns a boolean vector. Negating it turns the FALSEs into TRUEs, and which() then returns the indices that are not in the training data.
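A tiny demonstration of that indexing logic:
c(2, 5) %in% 1:3
## [1]  TRUE FALSE
which(!(1:5 %in% c(2, 4)))
## [1] 1 3 5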
To compare two models, compare the distributions of their MSEs: you want the leftmost (lowest) and narrowest distribution. In practice, people usually just compare mean(MSE) and var(MSE) for each model; a sketch of that comparison follows the histogram.
summary(MSE)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 24.56 26.10 26.63 26.59 27.08 28.31
hist(MSE)
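A minimal sketch of the comparison itself; the second model here (a quadratic in edyrs) is a hypothetical stand-in, just to show the mechanics:
MSE2 = vector()
set.seed(2020) #same seed, so both models are evaluated on the same random splits
for(i in 1:M){
ix.train = sample(1:nrow(hrs.new), n.sub, replace = F)
hrs.train = hrs.new[ix.train,]
hrs.test = hrs.new[-ix.train,] #negative indexing drops the training rows
fit2 = lm(bmi ~ poly(edyrs, 2), data = hrs.train) #hypothetical competing model
MSE2[i] = mean((hrs.test$bmi - predict(fit2, hrs.test))^2)
}
mean(MSE); mean(MSE2) #lower mean MSE = better out-of-sample fit on average
var(MSE); var(MSE2) #lower variance = more stable fit across splits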
In the rows below, note that all four observations with edyrs = 16 get the same predicted value (yhat), even though their actual bmi values differ.
cbind(hrs.test, yhat, hrs.test$bmi-yhat)[1:5,]
## bmi edyrs yhat hrs.test$bmi - yhat
## 2 18.5 8 27.75616 -9.2561568
## 4 32.4 16 26.41660 5.9833989
## 6 30.4 16 26.41660 3.9833989
## 7 36.6 16 26.41660 10.1833989
## 8 27.0 16 26.41660 0.5833989
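Since yhat = b0 + b1*edyrs in a simple linear regression, the prediction depends only on edyrs. A quick check, using the fit from the last loop iteration:
coef(fit) #intercept b0 and slope b1
coef(fit)[1] + coef(fit)[2]*16 #reproduces yhat (26.41660) for every row with edyrs = 16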