# install.packages("ISLR")
library(ISLR)
# Work on a copy of the ISLR Wage data, drop its row names so rows are simply
# numbered 1..n, and preview the first six rows.
df <- Wage
row.names(df) <- NULL
head(df, n = 6L)
## year age maritl race education region
## 1 2006 18 1. Never Married 1. White 1. < HS Grad 2. Middle Atlantic
## 2 2004 24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 3 2003 45 2. Married 1. White 3. Some College 2. Middle Atlantic
## 4 2003 43 2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 5 2005 50 4. Divorced 1. White 2. HS Grad 2. Middle Atlantic
## 6 2008 54 2. Married 1. White 4. College Grad 2. Middle Atlantic
## jobclass health health_ins logwage wage
## 1 1. Industrial 1. <=Good 2. No 4.318063 75.04315
## 2 2. Information 2. >=Very Good 2. No 4.255273 70.47602
## 3 1. Industrial 1. <=Good 1. Yes 4.875061 130.98218
## 4 2. Information 2. >=Very Good 1. Yes 5.041393 154.68529
## 5 2. Information 1. <=Good 1. Yes 4.318063 75.04315
## 6 2. Information 2. >=Very Good 1. Yes 4.845098 127.11574
Data preprocessing
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
library(ggplot2)
library(stringr)
library(rebus)
##
## Attaching package: 'rebus'
## The following object is masked from 'package:stringr':
##
## regex
## The following object is masked from 'package:ggplot2':
##
## alpha
# Strip the numeric level prefix ("1. ", "2. ", ...) and the "<=" / ">="
# markers from the labels of the categorical columns (columns 3 to 9).
df_new <- df[, 3:9]
# A single anchored pass per column: "^[0-9]+\\." only matches a leading
# "<digits>." (the dot is escaped, so it is a literal dot rather than "any
# character"), and "\\s*" also drops the blank that follows it, so the cleaned
# labels carry no leading whitespace. Any prefix number is handled, not just
# 1 to 5. The columns come back as character vectors.
df1 <- df_new
df1[] <- lapply(df_new, function(column) {
  labels <- sub("^[0-9]+\\.\\s*", "", as.character(column))
  sub("[<>]=", "", labels)
})
# Recombine the untouched columns (year, age, logwage, wage) with the cleaned
# categorical columns.
df <- data.frame(df[, c(1, 2, 10, 11)], df1)
head(df)
## year age logwage wage maritl race education
## 1 2006 18 4.318063 75.04315 Never Married White < HS Grad
## 2 2004 24 4.255273 70.47602 Never Married White College Grad
## 3 2003 45 4.875061 130.98218 Married White Some College
## 4 2003 43 5.041393 154.68529 Married Asian College Grad
## 5 2005 50 4.318063 75.04315 Divorced White HS Grad
## 6 2008 54 4.845098 127.11574 Married White College Grad
## region jobclass health health_ins
## 1 Middle Atlantic Industrial Good No
## 2 Middle Atlantic Information Very Good No
## 3 Middle Atlantic Industrial Good Yes
## 4 Middle Atlantic Information Very Good Yes
## 5 Middle Atlantic Information Good Yes
## 6 Middle Atlantic Information Very Good Yes
Alternative
df2 <- Wage
# Remove the leading "<digits>. " level prefix from columns 3 to 9 in one pass.
# The pattern is anchored to the start of the label and the dot is escaped, so
# only the prefix (and the blank after it) is removed. The "<=" / ">=" markers
# in `health` are intentionally left in place here. A plain function is passed
# instead of the deprecated funs() helper.
df3 <- df2 %>%
  mutate_at(3:9, function(column) sub("^[0-9]+\\.\\s*", "", column))
head(df3)
## year age maritl race education region
## 1 2006 18 Never Married White < HS Grad Middle Atlantic
## 2 2004 24 Never Married White College Grad Middle Atlantic
## 3 2003 45 Married White Some College Middle Atlantic
## 4 2003 43 Married Asian College Grad Middle Atlantic
## 5 2005 50 Divorced White HS Grad Middle Atlantic
## 6 2008 54 Married White College Grad Middle Atlantic
## jobclass health health_ins logwage wage
## 1 Industrial <=Good No 4.318063 75.04315
## 2 Information >=Very Good No 4.255273 70.47602
## 3 Industrial <=Good Yes 4.875061 130.98218
## 4 Information >=Very Good Yes 5.041393 154.68529
## 5 Information <=Good Yes 4.318063 75.04315
## 6 Information >=Very Good Yes 4.845098 127.11574
Visualizing a single numeric variable
# Histogram of wage using bins of width 10; outline and fill colours are taken
# from the default palette by index.
ggplot(data = df3, mapping = aes(x = wage)) +
  geom_histogram(binwidth = 10, colour = 3, fill = 4) +
  labs(x = "Wage", y = "Count", title = "Histogram")
Visualizing a histogram by a categorical variable
# Wage histograms (bin width 10), one panel per race laid out in two columns,
# with bars filled by race using the "Set1" Brewer palette.
ggplot(df3, aes(x = wage)) +
  geom_histogram(aes(fill = race), binwidth = 10, colour = 1) +
  facet_wrap(~race, ncol = 2) +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Wage", y = "Count", title = "Distribution of Wage") +
  # Centred, bold, size-15 title and bold axis titles.
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
    axis.title = element_text(face = "bold")
  )
# Overlaid wage histograms (bin width 10), one per race, in a single panel.
# position = "identity" draws every group's bars from the baseline on top of
# each other instead of stacking them, so the bars are made semi-transparent;
# with opaque bars the groups drawn last would hide the ones underneath.
ggplot(data = df3, mapping = aes(x = wage, fill = race)) +
  geom_histogram(position = "identity", binwidth = 10, alpha = 0.5)
Contour lines plot
# 2D density contour lines of wage against age, drawn from the original Wage
# data.
ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
  stat_density2d()
Displaying two or more variables using a scatterplot matrix
library(ISLR)
# Put the columns of the College data set on the search path so they can be
# referenced by bare name.
# NOTE(review): attach() is generally discouraged because attached names can
# mask, or be masked by, other objects. Code outside this view may rely on it,
# so confirm before removing.
attach(College)
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
# Bind five numeric College attributes into a matrix. The columns are looked up
# inside College explicitly via with(), so this does not depend on the data set
# having been attached to the search path.
X <- with(College, cbind(Apps, Accept, Enroll, Room.Board, Books))
# Scatterplot matrix with boxplots on the diagonal and no regression lines or
# smoothers; small solid blue points. FALSE is spelled out because F is an
# ordinary variable that can be reassigned.
# NOTE(review): newer car releases appear to have renamed these arguments
# (regLine, smooth, diagonal = list(method = "boxplot")) - confirm against the
# installed car version.
scatterplotMatrix(
  X,
  diagonal = "boxplot",
  reg.line = FALSE,
  smoother = FALSE,
  pch = 19,
  cex = 0.6,
  col = "blue"
)
# Add an overall title above the matrix (bold italic, navy).
title(
  main = "Scatterplot Matrix of College Attributes",
  col.main = "navy",
  font.main = 4,
  line = 3
)