library(ISLR)
library(ggplot2)
library(caret)
## Loading required package: lattice
# Load the Wage data set shipped with the ISLR package into the workspace
# and print a per-column summary of it.
data("Wage", package = "ISLR")
summary(Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# Fix the RNG seed so the random train/test split below -- and every plot and
# table derived from it -- can be reproduced from run to run. Without a seed,
# createDataPartition() draws a different sample on each execution.
set.seed(32343)

# Put roughly 70% of the rows into the training set, sampled on the outcome
# (wage); the remaining rows form the test set.
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]

# Sanity-check the sizes of the two partitions.
dim(training)
## [1] 2102 12
dim(testing)
## [1] 898 12
# Pairs plot of the outcome (wage) against three candidate predictors.
featurePlot(
  x = training[c("age", "education", "jobclass")],
  y = training[["wage"]],
  plot = "pairs"
)
## From the (1,1) and (1,2) panels we see that y (wage) has an interesting relationship with age and education. However, the plot shows no significant relationship between y and jobclass.
## So, let's take a closer look at the (1,1) panel first.
# Scatter plots of wage against age. qplot() is deprecated since
# ggplot2 3.4.0, so each plot is built with ggplot() and an explicit geom.

# wage ~ age
ggplot(training, aes(x = age, y = wage)) +
  geom_point()

# wage ~ age, points coloured by job class
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()

# wage ~ age, points coloured by education level
qq <- ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()

# Add linear-regression trend lines to show the relationship among the points
# (the colour aesthetic groups the fit by education level).
qq + geom_smooth(method = "lm", formula = y ~ x)
##Also, we can cut the data and build the box plot
library(Hmisc)
## Loading required package: grid
## Loading required package: survival
##
## Attaching package: 'survival'
##
## The following object is masked from 'package:caret':
##
## cluster
##
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:base':
##
## format.pval, round.POSIXt, trunc.POSIXt, units
# Bin the training wages into 3 groups (low, middle, high) with Hmisc::cut2(),
# then count how many observations fall into each group.
cutWage <- cut2(training[["wage"]], g = 3)
table(cutWage)
## cutWage
## [ 20.1, 92.2) [ 92.2,119.0) [119.0,318.3]
## 701 727 674
# Box plots of age within each wage group. qplot() is deprecated since
# ggplot2 3.4.0, so the plots are built with ggplot() and explicit geoms.
# cutWage is not a column of `training`; it is picked up from the workspace
# and must have one entry per training row.
p1 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot()
# Observation: age tends to be higher in the higher wage groups, but only
# slightly.
p1

# Same box plot with the individual observations jittered on top.
p2 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot() +
  geom_jitter()

library(gridExtra)
# Show both versions side by side. The dots reveal how the observations are
# distributed; a box backed by only a few dots is weakly supported by the data.
grid.arrange(p1, p2, ncol = 2)
##In addition, we could use table function
# Cross-tabulate the wage groups against job class.
t1 <- table(cutWage, training[["jobclass"]])
t1
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 92.2) 431 270
## [ 92.2,119.0) 369 358
## [119.0,318.3] 265 409
# Row-wise proportions: the share of each job class within every wage group.
prop.table(t1, margin = 1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 92.2) 0.6148359 0.3851641
## [ 92.2,119.0) 0.5075653 0.4924347
## [119.0,318.3] 0.3931751 0.6068249
# Density of wage, one curve per education level. Built with ggplot() because
# qplot() is deprecated since ggplot2 3.4.0.
ggplot(training, aes(x = wage, colour = education)) +
  geom_density()