Name: Shreyas Jadhav

Email: essjeyy0412@gmail.com

College: Dwarkadas J. Sanghvi College of Engineering

Part 2:

Plots and Tests

# Load the US income data set and replace the income-level codes with
# readable labels.
#
# NOTE(review): the hard-coded setwd() ties the script to one machine. It is
# kept here because other parts of the document may rely on the working
# directory; consider passing a full path to read.csv() instead.
setwd("C:/Users/Shreyas Jadhav/Downloads")

# paste() with a single string just returns that string, so the file name is
# passed directly.
usincome <- read.csv("project.csv")

# attach() places a snapshot of the columns on the search path. Changes made
# to `usincome` afterwards are not reflected in that snapshot: a bare
# `income_level` keeps the numeric codes, while `usincome$income_level` (and
# any call using `data = usincome`) sees the labels assigned below.
attach(usincome)

# Recode from the untouched numeric codes. Assigning the first label in place
# would turn the column into character, and the second comparison would then
# depend on how R formats the number 50000 as text.
usincome$income_level <- local({
  code <- usincome$income_level
  label <- as.character(code)
  label[which(code == -50000)] <- "Below 50k"
  label[which(code == 50000)] <- "Above 50k"
  label
})

Boxplots of the different variables

# Age within each class of worker. las = 2 draws the axis labels
# perpendicular to the axes.
boxplot(
  age ~ class_of_worker,
  data = usincome,
  main = "Boxplot Representation of Age Vs Class of Worker",
  xlab = "Class of Worker",
  ylab = "Age",
  las = 2
)

# Age within each income level (income_level is taken from usincome).
boxplot(
  age ~ income_level,
  data = usincome,
  main = "Boxplot Representation of Age Vs Income_Level",
  xlab = "Income Level",
  ylab = "Age"
)

# Industry code within each income level.
boxplot(
  industry_code ~ income_level,
  data = usincome,
  main = "Boxplot Representation of Industry Code Vs Income_Level",
  xlab = "Income_Level",
  ylab = "Industry"
)

# Wage per hour within each income level.
boxplot(
  wage_per_hour ~ income_level,
  data = usincome,
  main = "Boxplot Representation of Wage per Hour Vs Income_Level",
  xlab = "Income_Level",
  ylab = "Wage per Hour"
)

Histograms for suitable data fields.

# Lattice histograms of selected columns. Every call names `data = usincome`
# explicitly instead of relying on the attached copy of the columns.
library(lattice)

histogram(
  ~ age,
  data = usincome,
  main = "Distribution of Age",
  xlab = "Age Variation",
  col = "pink"
)

# `las` is a base-graphics parameter (see ?par); lattice controls tick-label
# rotation through `scales`, so the x labels are rotated by 90 degrees here.
histogram(
  ~ class_of_worker,
  data = usincome,
  main = "Distribution of Class of Worker",
  xlab = "Class of Worker",
  col = "green",
  scales = list(x = list(rot = 90))
)

histogram(
  ~ occupation_code,
  data = usincome,
  main = "Occupation Code Frequency Distribution",
  xlab = "Occupation Code",
  col = "gold2"
)

histogram(
  ~ industry_code,
  data = usincome,
  main = "Industry Code Frequency Distribution",
  xlab = "Industry Code",
  col = "black"
)

Different plots for data fields.

library(car)
# Scatterplot of income_level against age. Both names are bare, so they are
# looked up on the search path rather than in `usincome` directly.
scatterplot(
  age, income_level,
  main = "Variation in Salary with age",
  xlab = "Age",
  ylab = "Salary (US Dollars)"
)

library(lattice)
# One row per (marital_status, wage_per_hour) combination; `Freq` is the
# number of records with that combination.
marriage_and_wage <- as.data.frame(table(marital_status, wage_per_hour))

# The formula puts `Freq` on the x axis and `marital_status` on the y axis,
# so the axis labels name those variables. The x axis is limited to counts
# between 0 and 200.
xyplot(
  marital_status ~ Freq,
  data = marriage_and_wage,
  type = c("p", "g", "smooth"),
  xlab = "Frequency",
  ylab = "Marital Status",
  xlim = c(0, 200)
)

# Base-graphics plot of industry code against wage per hour, followed by a
# reference line with intercept 0 and slope 1.
plot(
  industry_code ~ wage_per_hour,
  main = "Industry code Vs Wage / Hour"
)
abline(a = 0, b = 1)

# Long-format frequency table: one row per (education, occupation_code)
# combination with its count in `Freq`. The dimension names are spelled out
# and match the ones table() derives from the argument names.
Occ <- as.data.frame(
  table(education, occupation_code, dnn = c("education", "occupation_code"))
)
library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
## 
##     cancer
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following object is masked from 'package:MASS':
## 
##     select
## The following object is masked from 'package:car':
## 
##     recode
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Bubble chart of education against occupation code: one point per
# combination that occurs at least once, sized by its frequency.
#
# `show.legend` is an argument of the layer, not an aesthetic. Passing it
# inside aes() (under its old name `show_guide`) only produced an
# "Ignoring unknown aesthetics" warning. dplyr::filter is namespace-qualified
# so the call does not depend on which `filter` is first on the search path.
g <- ggplot(
  dplyr::filter(Occ, Freq > 0),
  aes(x = occupation_code, y = education)
) +
  scale_size(range = c(2, 20), guide = "none") +
  geom_point(aes(size = Freq + 20), colour = "gold2", show.legend = FALSE)
g

Correlation Matrix using corrgram

# Select nine columns of usincome by position for the correlogram.
# NOTE(review): positional indices depend on the column order of the CSV;
# confirm they still point at the intended variables if the file changes.
x <- usincome[, c(1, 3, 4, 6, 17, 18, 19, 39, 41)]
library(corrgram)

# Palette function interpolating between four colours.
cols4 <- colorRampPalette(
  c("peachpuff", "lightpink", "royalblue3", "navyblue")
)

# Shaded cells below the diagonal, pie glyphs above it, variable names on it;
# variables stay in their original order.
corrgram(
  x,
  order = FALSE,
  col.regions = cols4,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of variables"
)

Scatter plot Matrix

library(car)
# Pairwise scatterplots of wage per hour, class of worker and race from
# usincome, drawn with smaller symbols (cex = 0.6) and the "density" option
# for the diagonal panels.
scatterplotMatrix(
  formula = ~ wage_per_hour + class_of_worker + race,
  cex = 0.6,
  data = usincome,
  diagonal = "density"
)

Tests

# Contingency tables used by the chi-squared tests further down. All three
# are built from `usincome`, so `income_level` in mytable2 is the column as
# stored in the data frame.
mytable<-xtabs(~race+age,data=usincome)
mytable1<-xtabs(~class_of_worker+marital_status,data=usincome)
mytable2<-xtabs(~major_industry_code+income_level,data=usincome)
# Pearson correlation tests of income_level against four numeric columns.
# The bare names are not defined in this script; they resolve through the
# earlier attach(usincome) call. cor.test() requires numeric input, so the
# `income_level` found here is the numeric code, not a text label.
cor.test(income_level,age)
## 
##  Pearson's product-moment correlation
## 
## data:  income_level and age
## t = 13.223, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1117873 0.1503139
## sample estimates:
##       cor 
## 0.1311001
cor.test(income_level,wage_per_hour)
## 
##  Pearson's product-moment correlation
## 
## data:  income_level and wage_per_hour
## t = 2.5488, df = 9998, p-value = 0.01082
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.005885531 0.045060225
## sample estimates:
##        cor 
## 0.02548266
cor.test(income_level,capital_gains)
## 
##  Pearson's product-moment correlation
## 
## data:  income_level and capital_gains
## t = 23.912, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2139579 0.2510383
## sample estimates:
##       cor 
## 0.2325826
cor.test(income_level,capital_losses)
## 
##  Pearson's product-moment correlation
## 
## data:  income_level and capital_losses
## t = 14.154, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1208917 0.1593220
## sample estimates:
##       cor 
## 0.1401596
# Chi-squared tests of independence on the three contingency tables.
# NOTE(review): each recorded run warns that the chi-squared approximation
# may be incorrect. This presumably reflects sparse cells (the race-by-age
# table has 360 degrees of freedom); consider confirming the p-values with
# simulate.p.value = TRUE or coarser categories.
chisq.test(mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable
## X-squared = 431.81, df = 360, p-value = 0.005527
chisq.test(mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable1
## X-squared = 2079.7, df = 48, p-value < 2.2e-16
chisq.test(mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
## 
##  Pearson's Chi-squared test
## 
## data:  mytable2
## X-squared = 776.6, df = 23, p-value < 2.2e-16