Example: Wage data

library(ISLR)
library(ggplot2)
library(caret)
## Loading required package: lattice
# Load the Wage data set from the ISLR package into the workspace, then print
# a per-column summary (quantiles for numeric columns, counts for factors).
data("Wage", package = "ISLR")
summary(Wage)
##       year           age               sex                    maritl    
##  Min.   :2003   Min.   :18.00   1. Male  :3000   1. Never Married: 648  
##  1st Qu.:2004   1st Qu.:33.75   2. Female:   0   2. Married      :2074  
##  Median :2006   Median :42.00                    3. Widowed      :  19  
##  Mean   :2006   Mean   :42.41                    4. Divorced     : 204  
##  3rd Qu.:2008   3rd Qu.:51.00                    5. Separated    :  55  
##  Max.   :2009   Max.   :80.00                                           
##                                                                         
##        race                   education                     region    
##  1. White:2480   1. < HS Grad      :268   2. Middle Atlantic   :3000  
##  2. Black: 293   2. HS Grad        :971   1. New England       :   0  
##  3. Asian: 190   3. Some College   :650   3. East North Central:   0  
##  4. Other:  37   4. College Grad   :685   4. West North Central:   0  
##                  5. Advanced Degree:426   5. South Atlantic    :   0  
##                                           6. East South Central:   0  
##                                           (Other)              :   0  
##            jobclass               health      health_ins      logwage     
##  1. Industrial :1544   1. <=Good     : 858   1. Yes:2083   Min.   :3.000  
##  2. Information:1456   2. >=Very Good:2142   2. No : 917   1st Qu.:4.447  
##                                                            Median :4.653  
##                                                            Mean   :4.654  
##                                                            3rd Qu.:4.857  
##                                                            Max.   :5.763  
##                                                                           
##       wage       
##  Min.   : 20.09  
##  1st Qu.: 85.38  
##  Median :104.92  
##  Mean   :111.70  
##  3rd Qu.:128.68  
##  Max.   :318.34  
## 

From the summary, we can see that all 3,000 observations are male and that all of them come from the Middle Atlantic region.

# Split Wage into a training set (about 70% of rows) and a test set (the rest).
# createDataPartition() draws rows at random, so the RNG seed is fixed first;
# without it every run produces a different split and nothing computed from
# `training` can be reproduced.
# NOTE(review): the outputs recorded in this document came from an unseeded
# run, so tables and plots derived from `training` may differ slightly.
set.seed(32343)
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)
training <- Wage[inTrain, ]   # rows selected for model building
testing <- Wage[-inTrain, ]   # all remaining rows, held out for evaluation
dim(training)
## [1] 2102   12
dim(testing)
## [1] 898  12

Now, let’s plot

# Pairs plot of the outcome (wage) together with age, education and jobclass,
# using the training rows only.
featurePlot(
  x = training[, c("age", "education", "jobclass")],
  y = training$wage,
  plot = "pairs"
)

## From the (1,1) and (1,2) plots we can see that the outcome y (wage) has an interesting relationship with age and education, whereas the plot shows no significant relationship between y and jobclass. So, let's take a deeper look at the (1,1) plot first.

# Scatter plots of wage against age on the training set.
# qplot() is deprecated as of ggplot2 3.4.0, so each plot is built with
# ggplot() plus an explicit geom; the figures are the same as before.

# wage ~ age
ggplot(training, aes(x = age, y = wage)) +
  geom_point()

# wage ~ age, points coloured by jobclass
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()

# wage ~ age, points coloured by education level
qq <- ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()
# Add a linear regression line per education level to show the trend in the
# relation between age and wage within each group.
qq + geom_smooth(method = "lm", formula = y ~ x)

## We can also cut wage into groups and draw box plots

library(Hmisc)
## Loading required package: grid
## Loading required package: survival
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, round.POSIXt, trunc.POSIXt, units
# Bin the training wages into a single factor with three levels
# (low / middle / high), then count the observations in each level.
cutWage <- cut2(training[["wage"]], g = 3)
table(cutWage)
## cutWage
## [ 20.1, 92.2) [ 92.2,119.0) [119.0,318.3] 
##           701           727           674
# Box plots of age within each wage group.
# qplot() is deprecated as of ggplot2 3.4.0, so ggplot() with explicit geoms
# is used instead. cutWage is not a column of `training`; aes() resolves it
# from the calling environment (it was computed from training$wage above).
p1 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot()
p1 # age tends to increase with the wage group, but only slightly

# Same box plots with the individual observations jittered on top.
p2 <- p1 + geom_jitter()
library(gridExtra)
# Show both versions side by side. The dots reveal how the observations are
# distributed: a box backed by only a few dots is weakly supported.
grid.arrange(p1, p2, ncol = 2)

## In addition, we can use the table() function to cross-tabulate the wage groups against jobclass

# Cross-tabulate wage group (rows) against job class (columns) and print the
# resulting counts.
t1 <- table(cutWage, training[["jobclass"]])
t1
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 92.2)           431            270
##   [ 92.2,119.0)           369            358
##   [119.0,318.3]           265            409
# Turn the counts into proportions within each wage group (margin 1 = rows).
prop.table(t1, margin = 1)
##                
## cutWage         1. Industrial 2. Information
##   [ 20.1, 92.2)     0.6148359      0.3851641
##   [ 92.2,119.0)     0.5075653      0.4924347
##   [119.0,318.3]     0.3931751      0.6068249

We can also use density plots:

# Density of wage in the training set, one curve per education level.
# Built with ggplot() because qplot() is deprecated as of ggplot2 3.4.0.
ggplot(training, aes(x = wage, colour = education)) +
  geom_density()