# install.packages("ISLR")

library(ISLR)

# Work on a copy of the ISLR Wage data and reset its row names to 1..n
df <- Wage
row.names(df) <- NULL

# Preview the first six rows
head(df, n = 6L)
##   year age           maritl     race       education             region
## 1 2006  18 1. Never Married 1. White    1. < HS Grad 2. Middle Atlantic
## 2 2004  24 1. Never Married 1. White 4. College Grad 2. Middle Atlantic
## 3 2003  45       2. Married 1. White 3. Some College 2. Middle Atlantic
## 4 2003  43       2. Married 3. Asian 4. College Grad 2. Middle Atlantic
## 5 2005  50      4. Divorced 1. White      2. HS Grad 2. Middle Atlantic
## 6 2008  54       2. Married 1. White 4. College Grad 2. Middle Atlantic
##         jobclass         health health_ins  logwage      wage
## 1  1. Industrial      1. <=Good      2. No 4.318063  75.04315
## 2 2. Information 2. >=Very Good      2. No 4.255273  70.47602
## 3  1. Industrial      1. <=Good     1. Yes 4.875061 130.98218
## 4 2. Information 2. >=Very Good     1. Yes 5.041393 154.68529
## 5 2. Information      1. <=Good     1. Yes 4.318063  75.04315
## 6 2. Information 2. >=Very Good     1. Yes 4.845098 127.11574

Data preprocessing

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(ggplot2)

library(stringr)

library(rebus)
## 
## Attaching package: 'rebus'
## The following object is masked from 'package:stringr':
## 
##     regex
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# Clean the categorical columns (positions 3 to 9): drop the leading
# "<digit>. " level code and the "<=" / ">=" markers from every label.

df_new <- df[, 3:9]

# The prefix pattern is anchored with "^" and the dot is escaped, so only a
# leading "<digits>." code is removed (an unescaped "." matches any character,
# and an unanchored pattern can hit digits inside a label). The whitespace after
# the code is consumed too, so every label starts at its first real character.
# "<=" and ">=" are matched literally (fixed = TRUE), first occurrence only.
# A plain function is passed to mutate_all() instead of the deprecated funs().
# sub() returns character vectors, so factor columns become character columns.
df1 <- df_new %>%
  mutate_all(function(x) {
    x <- sub("^\\s*[0-9]+\\.\\s*", "", x)
    x <- sub("<=", "", x, fixed = TRUE)
    sub(">=", "", x, fixed = TRUE)
  })

# Merge the untouched columns (positions 1, 2, 10, 11) with the cleaned labels

df <- data.frame(df[, c(1, 2, 10, 11)], df1)

head(df)
##   year age  logwage      wage         maritl   race     education
## 1 2006  18 4.318063  75.04315  Never Married  White     < HS Grad
## 2 2004  24 4.255273  70.47602  Never Married  White  College Grad
## 3 2003  45 4.875061 130.98218        Married  White  Some College
## 4 2003  43 5.041393 154.68529        Married  Asian  College Grad
## 5 2005  50 4.318063  75.04315       Divorced  White       HS Grad
## 6 2008  54 4.845098 127.11574        Married  White  College Grad
##              region      jobclass      health health_ins
## 1   Middle Atlantic    Industrial        Good         No
## 2   Middle Atlantic   Information   Very Good         No
## 3   Middle Atlantic    Industrial        Good        Yes
## 4   Middle Atlantic   Information   Very Good        Yes
## 5   Middle Atlantic   Information        Good        Yes
## 6   Middle Atlantic   Information   Very Good        Yes

Alternative

df2 <- Wage

# Strip the leading "<digits>. " level code from columns 3 to 9 in one pass.
# The pattern is anchored and the dot escaped so that only the numeric prefix
# (plus the blank that follows it) is removed; the unescaped, unanchored
# "[123456789]." matched a digit followed by ANY character and left a leading
# space on every label. "<=" / ">=" markers are intentionally kept here.
# A plain function replaces the deprecated funs(); sub() returns character
# vectors, so the affected factor columns become character columns.
df3 <- df2 %>%
  mutate_at(3:9, function(x) sub("^\\s*[0-9]+\\.\\s*", "", x))

head(df3)
##   year age         maritl   race     education           region
## 1 2006  18  Never Married  White     < HS Grad  Middle Atlantic
## 2 2004  24  Never Married  White  College Grad  Middle Atlantic
## 3 2003  45        Married  White  Some College  Middle Atlantic
## 4 2003  43        Married  Asian  College Grad  Middle Atlantic
## 5 2005  50       Divorced  White       HS Grad  Middle Atlantic
## 6 2008  54        Married  White  College Grad  Middle Atlantic
##       jobclass       health health_ins  logwage      wage
## 1   Industrial       <=Good         No 4.318063  75.04315
## 2  Information  >=Very Good         No 4.255273  70.47602
## 3   Industrial       <=Good        Yes 4.875061 130.98218
## 4  Information  >=Very Good        Yes 5.041393 154.68529
## 5  Information       <=Good        Yes 4.318063  75.04315
## 6  Information  >=Very Good        Yes 4.845098 127.11574

Visualizing a single numeric variable

# Histogram of wage using bins of width 10; outline and fill colours are
# given as numeric colour indices (3 and 4)
ggplot(df3, aes(x = wage)) +
  geom_histogram(binwidth = 10, colour = 3, fill = 4) +
  labs(x = "Wage", y = "Count", title = "Histogram")

Visualizing a histogram by a categorical variable

# Wage histogram (bin width 10, outline colour 1) filled by race and drawn in
# one panel per race, two panels per row
ggplot(df3, aes(x = wage)) +
  geom_histogram(aes(fill = race), binwidth = 10, colour = 1) +
  facet_wrap(~race, ncol = 2) +
  scale_fill_brewer(palette = "Set1") +
  labs(x = "Wage", y = "Count", title = "Distribution of Wage") +
  # Centred, bold, size-15 title and bold axis titles
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold", size = 15),
    axis.title = element_text(face = "bold")
  )

# Overlaid wage histograms, one per race, sharing the same bins (width 10).
# position = "identity" draws the groups on top of each other instead of
# stacking them, so the bars are made semi-transparent; with opaque fills the
# groups drawn last hide the ones underneath.
ggplot(data = df3, aes(x = wage, fill = race)) +
  geom_histogram(position = "identity", binwidth = 10, alpha = 0.5)

Contour lines plot

# Contour lines of the 2D density estimate of wage against age
ggplot(data = Wage, mapping = aes(x = age, y = wage)) +
  stat_density_2d()

Displaying two or more variables using a scatterplot matrix

library(ISLR)
library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
# Numeric matrix of the five College attributes to compare. with() looks the
# columns up inside College, so the data set does not have to be attach()ed
# to the search path (where its columns could mask, or be masked by, other
# objects of the same name).
X <- with(College, cbind(Apps, Accept, Enroll, Room.Board, Books))

# Pairwise scatterplots with boxplots on the diagonal and neither a regression
# line nor a smoother; TRUE/FALSE are spelled out because T and F are ordinary
# variables that can be reassigned.
scatterplotMatrix(X, diagonal = c("boxplot"), reg.line = FALSE,
                  smoother = FALSE, pch = 19, cex = 0.6, col = "blue")

# Add a main title to the plot drawn above
title(main = "Scatterplot Matrix of College Attributes", col.main = "navy",
      font.main = 4, line = 3)