library(ISLR)
library(ggplot2)
library(caret)
## Loading required package: lattice
# Load the Wage data set that ships with ISLR.
data(Wage)
# wage is the outcome we want to predict. logwage is presumably just the log
# of that outcome, so drop the column to keep it out of the candidate
# predictors.
Wage <- Wage[, setdiff(names(Wage), "logwage"), drop = FALSE]
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083
## 2. Information:1456 2. >=Very Good:2142 2. No : 917
##
##
##
##
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# Fix the RNG seed so the random 70/30 split, and every result computed from
# it further down, is the same on each run.
set.seed(32343)
# Row indices of the training set (70% of the rows, sampled on wage).
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
# Everything not selected for training is held out for testing.
testing <- Wage[-inTrain, ]
dim(training)
## [1] 2102 11
dim(testing)
## [1] 898 11
# Scatterplot matrix of the outcome against three candidate predictors.
featurePlot(
  x = training[, c("age", "education", "jobclass")],
  y = training$wage,
  plot = "pairs"
)
# Age versus wage, coloured by job class. Built with ggplot() rather than
# qplot(), which is deprecated in current ggplot2 releases.
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()
# Same scatterplot, coloured by education level.
ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()
# Linear regression of wage on age, job class and education, fitted through
# caret's train() so that print() also reports resampled performance.
modFit <- train(
  wage ~ age + jobclass + education,
  data = training,
  method = "lm"
)
# Keep a handle on the underlying fitted model for the diagnostic plots.
finMod <- modFit$finalModel
print(modFit)
## Linear Regression
##
## 2102 samples
## 10 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
##
## Summary of sample sizes: 2102, 2102, 2102, 2102, 2102, 2102, ...
##
## Resampling results
##
## RMSE Rsquared RMSE SD Rsquared SD
## 35.83034 0.2598788 1.611693 0.02021685
##
##
# Residuals-versus-fitted diagnostic; nearly transparent points show density.
plot(finMod, 1, pch = 19, cex = 0.5, col = "#00000010")
# The smoothed line stays mostly close to zero, so there is no strong
# systematic trend in the residuals. The outliers at the top might be
# explained by predictors that are not in the model, so colour the points by
# one of them (race).
# fitted()/resid() replace finMod$fitted, which only worked through partial
# matching of the element name "fitted.values".
ggplot(
  training,
  aes(x = fitted(finMod), y = resid(finMod), colour = race)
) +
  geom_point()
# Residuals plotted by row index: a visible trend here would suggest that a
# variable related to the ordering of the rows is missing from the model.
plot(resid(finMod), pch = 19)
# Predicted versus true wage on the held-out test set.
pred <- predict(modFit, testing)
# Ideally the points would lie on the 45-degree line (pred equal to wage).
ggplot(testing, aes(x = wage, y = pred, colour = year)) +
  geom_point()
# Another evaluation method: root-mean-squared error on the test set.
# The comparison must use testing$wage. logwage was dropped from Wage, so
# testing$logwage is NULL, the difference is an empty vector and the result
# is always 0. mean() rather than sum() is what makes this a *mean* squared
# error, comparable with the RMSE reported by print(modFit).
sqrt(mean((pred - testing$wage)^2))
# Fit a linear model on all usable predictors.
# In this data set sex and region each take a single value (see the summary
# of Wage), so their dummy columns carry no information. Leaving them in
# makes the fit rank-deficient, and predict.lm() then warns "prediction from
# a rank-deficient fit may be misleading" during resampling and prediction.
# Drop every column that is constant in the training set before fitting.
isConstant <- vapply(
  training,
  function(column) length(unique(column)) < 2L,
  logical(1)
)
modFitAll <- train(
  wage ~ .,
  data = training[, !isConstant, drop = FALSE],
  method = "lm"
)
# testing keeps all of its columns; the dropped ones are simply not part of
# the fitted formula.
pred <- predict(modFitAll, testing)
# Predicted versus true wage for the all-predictor model.
ggplot(testing, aes(x = wage, y = pred)) +
  geom_point()