library(ISLR);library(ggplot2); library(caret); library(gridExtra);
# Load the Wage data set into the workspace, then display a summary of
# every column (the output is pasted below as `##` comments).
data("Wage")
summary(object = Wage)
## year age sex maritl
## Min. :2003 Min. :18.00 1. Male :3000 1. Never Married: 648
## 1st Qu.:2004 1st Qu.:33.75 2. Female: 0 2. Married :2074
## Median :2006 Median :42.00 3. Widowed : 19
## Mean :2006 Mean :42.41 4. Divorced : 204
## 3rd Qu.:2008 3rd Qu.:51.00 5. Separated : 55
## Max. :2009 Max. :80.00
##
## race education region
## 1. White:2480 1. < HS Grad :268 2. Middle Atlantic :3000
## 2. Black: 293 2. HS Grad :971 1. New England : 0
## 3. Asian: 190 3. Some College :650 3. East North Central: 0
## 4. Other: 37 4. College Grad :685 4. West North Central: 0
## 5. Advanced Degree:426 5. South Atlantic : 0
## 6. East South Central: 0
## (Other) : 0
## jobclass health health_ins logwage
## 1. Industrial :1544 1. <=Good : 858 1. Yes:2083 Min. :3.000
## 2. Information:1456 2. >=Very Good:2142 2. No : 917 1st Qu.:4.447
## Median :4.653
## Mean :4.654
## 3rd Qu.:4.857
## Max. :5.763
##
## wage
## Min. : 20.09
## 1st Qu.: 85.38
## Median :104.92
## Mean :111.70
## 3rd Qu.:128.68
## Max. :318.34
##
# Split Wage into a training set (p = 0.7 of the rows) and a testing set
# (the remaining rows).
#
# createDataPartition() picks the rows at random, so the RNG seed is fixed
# first; without it every run produces a different split and therefore
# different tables and plots further down. The seed value is arbitrary.
set.seed(1234)
inTrain <- createDataPartition(y = Wage$wage, p = 0.7, list = FALSE)

# list = FALSE returns row indices rather than a list, so they can be used
# directly for positive (training) and negative (testing) row selection.
training <- Wage[inTrain, ]
testing <- Wage[-inTrain, ]

# Rows x columns of each subset.
dim(training); dim(testing)
## [1] 2102 12
## [1] 898 12
# Pairs plot (caret::featurePlot) of age, education and jobclass together
# with the wage outcome, using the training set only.
featurePlot(
  x = training[c("age", "education", "jobclass")],
  y = training[["wage"]],
  plot = "pairs"
)
# Scatter plots of wage against age (ggplot2 package).
# Written with ggplot() + geom_point() instead of qplot(): qplot() is
# deprecated as of ggplot2 3.4.0 and emits a deprecation warning. For these
# calls it builds exactly this plot (geom = "auto" with x and y -> points).
ggplot(training, aes(x = age, y = wage)) +
  geom_point()

# Same scatter, points coloured by job class.
ggplot(training, aes(x = age, y = wage, colour = jobclass)) +
  geom_point()

# Points coloured by education level; kept in `qq` so a layer can be added.
qq <- ggplot(training, aes(x = age, y = wage, colour = education)) +
  geom_point()

# Overlay a linear-model fit. colour is a plot-level aesthetic, so the
# smoother inherits it and a separate line is fitted per education level.
qq + geom_smooth(method = "lm", formula = y ~ x)
# Hmisc provides cut2(), used here to bin a numeric vector.
library(Hmisc)

# Cut the training-set wages into g = 3 groups, then count how many
# observations fall into each group.
cutWage <- cut2(x = training[["wage"]], g = 3)
table(cutWage)
## cutWage
## [ 20.1, 91.7) [ 91.7,119.0) [119.0,318.3]
## 703 727 672
# Box plots of age within each wage group, filled by wage group.
# Written with ggplot() instead of qplot(), which is deprecated as of
# ggplot2 3.4.0 and emits a deprecation warning.
# cutWage is a free-standing vector rather than a column of `training`;
# aes() falls back to the calling environment to find it, so it must have
# one element per row of `training`.
p1 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot()
p1

# Same box plots with the individual observations jittered on top
# (the equivalent of geom = c("boxplot", "jitter")).
p2 <- ggplot(training, aes(x = cutWage, y = age, fill = cutWage)) +
  geom_boxplot() +
  geom_jitter()

# Show both versions side by side.
grid.arrange(p1, p2, ncol = 2)
# Tables
# Cross-tabulate wage group (rows) against job class (columns). The outer
# parentheses make the assignment visible, so the table is stored in t1
# and displayed in one step.
(t1 <- table(cutWage, training[["jobclass"]]))
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 454 249
## [ 91.7,119.0) 359 368
## [119.0,318.3] 263 409
# Row-wise proportions of t1: margin = 1 divides each count by its row
# total, so every row sums to 1.
prop.table(t1, margin = 1)
##
## cutWage 1. Industrial 2. Information
## [ 20.1, 91.7) 0.6458037 0.3541963
## [ 91.7,119.0) 0.4938102 0.5061898
## [119.0,318.3] 0.3913690 0.6086310
# Density of wage in the training set, one curve per education level.
# Written with ggplot() instead of qplot(), which is deprecated as of
# ggplot2 3.4.0 and emits a deprecation warning.
# NOTE(review): lwd = 1 is kept inside aes() on purpose. qplot() treats any
# argument named after an aesthetic as a *mapping* unless it is wrapped in
# I(), so the original call mapped line width to the constant 1 (adding a
# legend entry) rather than fixing the width at 1. If a fixed width was the
# intent, move it out of aes() and set it on the geom instead - confirm.
ggplot(training, aes(x = wage, colour = education, lwd = 1)) +
  geom_density()