Plots and Tests
# Load the US income extract and turn the numeric income code into labels.
# NOTE(review): the working directory is a hard-coded, user-specific path,
# so the script only runs on that machine. It is kept here so that any later
# relative paths still resolve; consider a project-relative path instead.
setwd("C:/Users/Shreyas Jadhav/Downloads")
usincome <- read.csv("project.csv")

# attach() snapshots the columns onto the search path at this point, i.e.
# BEFORE the recode below. Bare references to income_level elsewhere in the
# script therefore still see the numeric code (-50000 / 50000), while
# usincome$income_level holds the text labels. Do not reorder these steps.
attach(usincome)

# Derive both labels from the original codes in a single step. Recoding the
# column in place twice would make the second comparison run against a
# column already coerced to character. %in% never yields NA, so missing
# codes simply stay NA.
usincome$income_level <- local({
  code <- usincome$income_level
  label <- as.character(code)
  label[code %in% -50000] <- "Below 50k"
  label[code %in% 50000] <- "Above 50k"
  label
})
Boxplots of the different variables
# Age by class of worker. las = 2 draws the axis labels perpendicular to the
# axis so the category names do not overlap.
boxplot(
  age ~ class_of_worker,
  data = usincome,
  las = 2,
  xlab = "Class of Worker",
  ylab = "Age",
  main = "Boxplot Representation of Age Vs Class of Worker"
)

# Age by income bracket.
boxplot(
  age ~ income_level,
  data = usincome,
  xlab = "Income Level",
  ylab = "Age",
  main = "Boxplot Representation of Age Vs Income_Level"
)

# Industry code by income bracket.
boxplot(
  industry_code ~ income_level,
  data = usincome,
  xlab = "Income_Level",
  ylab = "Industry",
  main = "Boxplot Representation of Industry Code Vs Income_Level"
)

# Hourly wage by income bracket.
boxplot(
  wage_per_hour ~ income_level,
  data = usincome,
  xlab = "Income_Level",
  ylab = "Wage per Hour",
  main = "Boxplot Representation of Wage per Hour Vs Income_Level"
)

Histograms for suitable data fields.
# Lattice histograms of age, class of worker, occupation code and industry
# code. Every call names its data frame explicitly instead of relying on the
# columns exposed by attach().
library(lattice)

histogram(
  ~ age,
  data = usincome,
  main = "Distribution of Age",
  xlab = "Age Variation",
  col = "pink"
)

# `las` is a base-graphics parameter; lattice rotates tick labels through
# the `scales` argument instead, so the rotation is requested that way.
histogram(
  ~ class_of_worker,
  data = usincome,
  main = "Distribution of Class of Worker",
  xlab = "Class of Worker",
  col = "green",
  scales = list(x = list(rot = 90))
)

histogram(
  ~ occupation_code,
  data = usincome,
  main = "Occupation Code Frequency Distribution",
  xlab = "Occupation Code",
  col = "gold2"
)

histogram(
  ~ industry_code,
  data = usincome,
  main = "Industry Code Frequency Distribution",
  xlab = "Industry Code",
  col = "black"
)

Other plots for selected data fields.
library(car)
library(lattice)

# Scatterplot of the income code against age. income_level is deliberately
# the bare name: it resolves to the attached copy, which still holds the
# numeric code, whereas usincome$income_level now holds text labels.
scatterplot(
  age, income_level,
  main = "Variation in Salary with age",
  xlab = "Age",
  ylab = "Salary (US Dollars)"
)

# Count every marital status / hourly wage combination. with() keeps the
# table dimension names, so the resulting columns are marital_status,
# wage_per_hour and Freq.
marriage_and_wage <- as.data.frame(
  with(usincome, table(marital_status, wage_per_hour))
)

# The formula puts the cell count (Freq) on the x axis and marital status on
# the y axis; the axis labels now say so (they previously read "Wage/Hour"
# and "Frequency of Earning"). xlim restricts the count axis to 0-200.
# NOTE(review): if the intent was to plot frequency against hourly wage, the
# formula itself needs changing - confirm with the author.
xyplot(
  marital_status ~ Freq,
  data = marriage_and_wage,
  type = c("p", "g", "smooth"),
  xlab = "Frequency",
  ylab = "Marital Status",
  xlim = c(0, 200)
)

plot(
  industry_code ~ wage_per_hour,
  data = usincome,
  main = "Industry code Vs Wage / Hour"
)
# Reference line with intercept 0 and slope 1 (y = x); this is not a fitted
# regression line.
abline(0, 1)

# Long-format counts of every education / occupation code pair; the columns
# are education, occupation_code and Freq.
Occ <- as.data.frame(
  with(usincome, table(education, occupation_code))
)
library(UsingR)
## Loading required package: MASS
## Loading required package: HistData
## Loading required package: Hmisc
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Attaching package: 'UsingR'
## The following object is masked from 'package:survival':
##
## cancer
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following object is masked from 'package:MASS':
##
## select
## The following object is masked from 'package:car':
##
## recode
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Bubble chart of education against occupation code, one point per pair that
# actually occurs in the data (Freq > 0), sized by how often it occurs.
# dplyr::filter is spelled out because stats::filter has the same name.
g <- ggplot(
  dplyr::filter(Occ, Freq > 0),
  aes(x = occupation_code, y = education)
)
# Map counts onto point sizes between 2 and 20 and hide the size guide.
g <- g + scale_size(range = c(2, 20), guide = "none")
# show.legend is a layer argument, not an aesthetic: passing the old
# `show_guide` inside aes() only triggered an "unknown aesthetics" warning.
g <- g + geom_point(
  aes(size = Freq + 20),
  colour = "gold2",
  show.legend = FALSE
)
g

Correlation Matrix using corrgram
# Pick the columns for the correlogram by position.
# NOTE(review): positional indices depend on the CSV column order - confirm
# they still point at the intended variables if the file layout changes.
x <- usincome[, c(1, 3, 4, 6, 17, 18, 19, 39, 41)]

library(corrgram)

# Colour ramp used to shade the correlation panels.
cols4 <- colorRampPalette(
  c("peachpuff", "lightpink", "royalblue3", "navyblue")
)

# Shaded cells below the diagonal, pie glyphs above it, variable names on
# the diagonal; variables stay in their original order.
corrgram(
  x,
  order = FALSE,
  col.regions = cols4,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of variables"
)

Scatterplot Matrix
library(car)

# Pairwise scatterplots of hourly wage, class of worker and race, drawn with
# reduced symbol size (cex = 0.6).
# NOTE(review): the accepted form of `diagonal` differs between car
# versions - confirm against the installed version.
scatterplotMatrix(
  formula = ~ wage_per_hour + class_of_worker + race,
  data = usincome,
  diagonal = "density",
  cex = 0.6
)

Tests
# Two-way contingency tables that feed the chi-squared tests further down.
# Race by age.
mytable <- xtabs(~ race + age, data = usincome)
# Class of worker by marital status.
mytable1 <- xtabs(~ class_of_worker + marital_status, data = usincome)
# Major industry code by income bracket.
mytable2 <- xtabs(~ major_industry_code + income_level, data = usincome)
# Pearson correlation of the income code with age, hourly wage, capital
# gains and capital losses. The bare column names resolve to the copy
# exposed by attach(usincome), whose income_level is still the numeric code;
# usincome$income_level holds text labels and could not be used here.
# The `##` lines are the recorded console output of each call.
cor.test(x = income_level, y = age)
##
## Pearson's product-moment correlation
##
## data: income_level and age
## t = 13.223, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1117873 0.1503139
## sample estimates:
## cor
## 0.1311001
cor.test(x = income_level, y = wage_per_hour)
##
## Pearson's product-moment correlation
##
## data: income_level and wage_per_hour
## t = 2.5488, df = 9998, p-value = 0.01082
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.005885531 0.045060225
## sample estimates:
## cor
## 0.02548266
cor.test(x = income_level, y = capital_gains)
##
## Pearson's product-moment correlation
##
## data: income_level and capital_gains
## t = 23.912, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2139579 0.2510383
## sample estimates:
## cor
## 0.2325826
cor.test(x = income_level, y = capital_losses)
##
## Pearson's product-moment correlation
##
## data: income_level and capital_losses
## t = 14.154, df = 9998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1208917 0.1593220
## sample estimates:
## cor
## 0.1401596
# Pearson chi-squared tests of independence on the three contingency tables.
# The `##` lines are the recorded console output. Each run also recorded R's
# "approximation may be incorrect" warning, so the p-values should be read
# with that caveat.
# Race by age.
chisq.test(x = mytable)
## Warning in chisq.test(mytable): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable
## X-squared = 431.81, df = 360, p-value = 0.005527
# Class of worker by marital status.
chisq.test(x = mytable1)
## Warning in chisq.test(mytable1): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable1
## X-squared = 2079.7, df = 48, p-value < 2.2e-16
# Major industry code by income bracket.
chisq.test(x = mytable2)
## Warning in chisq.test(mytable2): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: mytable2
## X-squared = 776.6, df = 23, p-value < 2.2e-16