library(foreign)
# Load the WAGE2 Stata data set into a data frame.
wages <- read.dta("WAGE2.dta")
# View() opens a spreadsheet window; it is only useful (and only reliably
# available) in an interactive session, so skip it when the script is run
# with Rscript or knitted.
if (interactive()) {
  View(wages)
}
# Number of rows and columns.
dim(wages)
## [1] 935 17
# Column names, types and the first few values of each column.
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: int 1 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : int 1 1 1 1 1 1 1 1 0 1 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
library(psych)
# Central-tendency and spread statistics for every column; the statistics
# are picked by column name rather than by position in describe()'s output.
describe(wages)[, c("mean", "sd", "median", "min", "max")]
## mean sd median min max
## wage 957.95 404.36 905.00 115.00 3078.00
## hours 43.93 7.22 40.00 20.00 80.00
## IQ 101.28 15.05 102.00 50.00 145.00
## KWW 35.74 7.64 37.00 12.00 56.00
## educ 13.47 2.20 12.00 9.00 18.00
## exper 11.56 4.37 11.00 1.00 23.00
## tenure 7.23 5.08 7.00 0.00 22.00
## age 33.08 3.11 33.00 28.00 38.00
## married 0.89 0.31 1.00 0.00 1.00
## black 0.13 0.33 0.00 0.00 1.00
## south 0.34 0.47 0.00 0.00 1.00
## urban 0.72 0.45 1.00 0.00 1.00
## sibs 2.94 2.31 2.00 0.00 14.00
## brthord 2.28 1.60 2.00 1.00 10.00
## meduc 10.68 2.85 12.00 0.00 18.00
## feduc 10.22 3.30 10.00 0.00 18.00
## lwage 6.78 0.42 6.81 4.74 8.03
What are the factors affecting the wage of a person?
Note: For simplicity, this analysis leaves out the variables black, south, feduc, sibs, meduc and brthord.
# Structure before recoding: married and urban are still plain 0/1 integer
# columns at this point.
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: int 1 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : int 1 1 1 1 1 1 1 1 0 1 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
# Recode the 0/1 indicator columns as labelled factors.
#
# factor() maps each code to its label in a single step, so the column never
# passes through a mixed character state such as c("Unmarried", "1", ...).
# The `levels` order is chosen so the resulting level order is the same as
# before: "Married" < "Unmarried" and "Rural" < "Urban".
# The is.factor() guards keep a second run of this section a no-op instead of
# turning every value into NA.
if (!is.factor(wages$married)) {
  wages$married <- factor(
    wages$married,
    levels = c(1, 0),
    labels = c("Married", "Unmarried")
  )
}
if (!is.factor(wages$urban)) {
  wages$urban <- factor(
    wages$urban,
    levels = c(0, 1),
    labels = c("Rural", "Urban")
  )
}
# Confirm that both columns are now two-level factors.
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: Factor w/ 2 levels "Married","Unmarried": 1 1 1 1 1 1 2 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 1 2 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
# One-way frequency counts of marital status.
table(wages[["married"]])
##
## Married Unmarried
## 835 100
# One-way frequency counts of location.
table(wages[["urban"]])
##
## Rural Urban
## 264 671
# attach() puts the columns of `wages` on the search path so that code which
# refers to them by bare name can find them.
attach(wages)
# Two-way table of marital status by location, shown as row percentages
# (each marital-status row sums to 100).
mytable <- with(wages, table(married, urban))
prop.table(mytable, margin = 1) * 100
## urban
## married Rural Urban
## Married 28.86228 71.13772
## Unmarried 23.00000 77.00000
# Horizontal box plots of wage grouped by each explanatory variable.
# All nine plots share one layout; only the grouping variable, the title and
# the group-axis label differ, so those are listed once and drawn in a loop.
# local() keeps the helper objects out of the global workspace.
local({
  wage_boxplots <- list(
    list(
      model = wage ~ hours,
      title = "Distribution of Wages with number of hours worked per week",
      groups = "Number of hours worked per week"
    ),
    list(
      model = wage ~ IQ,
      title = "Distribution of Wages with IQ",
      groups = "IQ"
    ),
    list(
      model = wage ~ KWW,
      title = "Distribution of Wages with KWW",
      groups = "Knowledge of Work World Score"
    ),
    list(
      model = wage ~ educ,
      title = "Distribution of Wages with Education",
      groups = "Years of education"
    ),
    list(
      model = wage ~ exper,
      title = "Distribution of Wages with Work Experience",
      groups = "Years of work experience"
    ),
    list(
      model = wage ~ tenure,
      title = "Distribution of Wages with Tenure",
      groups = "Tenure"
    ),
    list(
      model = wage ~ age,
      title = "Distribution of Wages with Age",
      groups = "Age"
    ),
    list(
      model = wage ~ married,
      title = "Distribution of Wages with Marital Status",
      groups = "Marital Status"
    ),
    list(
      model = wage ~ urban,
      title = "Distribution of Wages with Location",
      groups = "Location"
    )
  )

  for (panel in wage_boxplots) {
    boxplot(
      panel$model,
      data = wages,
      main = panel$title,
      xlab = "Wages",
      ylab = panel$groups,
      horizontal = TRUE
    )
  }
})
library(lattice)
# Histogram of each analysed variable.
# The formula interface with `data = wages` reads the columns straight from
# the data frame, so these calls no longer depend on an earlier attach(wages)
# having put the columns on the search path.
histogram(~wage, data = wages, main = "Distribution of wage")
histogram(~hours, data = wages, main = "Distribution of hours worked")
histogram(~IQ, data = wages, main = "Distribution of IQ")
histogram(~KWW, data = wages, main = "Distribution of KWW")
histogram(~educ, data = wages, main = "Distribution of Years of Education")
histogram(~exper, data = wages, main = "Distribution of Years of Work experience")
histogram(~tenure, data = wages, main = "Distribution of Tenure")
histogram(~age, data = wages, main = "Distribution of age")
# married and urban are factors here, so these two show the share of
# observations in each level.
histogram(~married, data = wages, main = "Distribution of marital status")
histogram(~urban, data = wages, main = "Distribution of location")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplots of wage against each numeric explanatory variable.
# `data = wages` makes every call look the columns up in the data frame
# itself instead of relying on attach(wages) having been run beforehand.
scatterplot(
  wage ~ hours,
  data = wages,
  main = "Scatterplot of Wage with No. of hours worked",
  xlab = "No. of hours worked"
)
scatterplot(wage ~ IQ, data = wages, main = "Scatterplot of Wage with IQ")
scatterplot(wage ~ KWW, data = wages, main = "Scatterplot of Wage with KWW")
scatterplot(
  wage ~ educ,
  data = wages,
  main = "Scatterplot of Wage with Years of education",
  xlab = "No. of years of education"
)
scatterplot(
  wage ~ exper,
  data = wages,
  main = "Scatterplot of Wage with Work experience",
  xlab = "No. of years of work experience"
)
scatterplot(
  wage ~ tenure,
  data = wages,
  main = "Scatterplot of Wage with Tenure"
)
scatterplot(wage ~ age, data = wages, main = "Scatterplot of Wage with Age")
# Pairwise correlations between wage and the seven numeric explanatory
# variables, selected by name and printed to two decimal places.
dataColumns <- wages[
  ,
  c("wage", "hours", "IQ", "KWW", "educ", "exper", "tenure", "age")
]
res <- cor(dataColumns)
round(res, digits = 2)
## wage hours IQ KWW educ exper tenure age
## wage 1.00 -0.01 0.31 0.33 0.33 0.00 0.13 0.16
## hours -0.01 1.00 0.07 0.11 0.09 -0.06 -0.06 0.02
## IQ 0.31 0.07 1.00 0.41 0.52 -0.22 0.04 -0.04
## KWW 0.33 0.11 0.41 1.00 0.39 0.02 0.14 0.39
## educ 0.33 0.09 0.52 0.39 1.00 -0.46 -0.04 -0.01
## exper 0.00 -0.06 -0.22 0.02 -0.46 1.00 0.24 0.50
## tenure 0.13 -0.06 0.04 0.14 -0.04 0.24 1.00 0.27
## age 0.16 0.02 -0.04 0.39 -0.01 0.50 0.27 1.00
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Recompute the correlation matrix of the eight numeric variables and draw it
# as a corrgram: pie glyphs above the diagonal, shaded cells below it, and
# the variable names on the diagonal, in the original column order.
dataColumns <- wages[
  ,
  c("wage", "hours", "IQ", "KWW", "educ", "exper", "tenure", "age")
]
res <- cor(dataColumns)
corrgram(
  res,
  order = FALSE,
  upper.panel = panel.pie,
  lower.panel = panel.shade,
  text.panel = panel.txt,
  main = "Wages data corrgram"
)
# Scatterplot matrices of the eight numeric variables: first for the whole
# sample, then split by marital status, then split by location.
# `data = wages` lets each call resolve the columns from the data frame
# rather than from whatever attach() has placed on the search path.
scatterplotMatrix(
  ~ wage + hours + IQ + KWW + educ + exper + tenure + age,
  data = wages,
  main = "Scatterplot matrix of variables"
)
scatterplotMatrix(
  ~ wage + hours + IQ + KWW + educ + exper + tenure + age | married,
  data = wages,
  main = "Scatterplot matrix of variables divided on the basis of marital status"
)
scatterplotMatrix(
  ~ wage + hours + IQ + KWW + educ + exper + tenure + age | urban,
  data = wages,
  main = "Scatterplot matrix of variables divided on the basis of location"
)
library(psych)
# Shape statistics for every column, picked by name from describe()'s output.
describe(wages)[, c("skew", "kurtosis")]
## skew kurtosis
## wage 1.20 2.68
## hours 1.59 4.14
## IQ -0.34 -0.03
## KWW -0.29 -0.33
## educ 0.55 -0.74
## exper 0.08 -0.57
## tenure 0.43 -0.81
## age 0.12 -1.26
## married* 2.54 4.45
## black 2.22 2.93
## south 0.67 -1.55
## urban* -0.97 -1.07
## sibs 1.44 2.73
## brthord 1.75 3.50
## meduc -0.50 0.92
## feduc -0.04 -0.04
## lwage -0.27 0.51
We see that, apart from “married” and “black”, all variables pass the skewness and kurtosis check and can therefore be treated as approximately normally distributed.
Also, there are no other categorical variables of relevance, so Pearson’s chi-squared test and the t-test are not applicable.