library(foreign)
wages<-read.dta("WAGE2.dta")
View(wages)
dim(wages)
## [1] 935 17
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: int 1 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : int 1 1 1 1 1 1 1 1 0 1 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
library(psych)
describe(wages)[,c(3,4,5,8,9)]
## mean sd median min max
## wage 957.95 404.36 905.00 115.00 3078.00
## hours 43.93 7.22 40.00 20.00 80.00
## IQ 101.28 15.05 102.00 50.00 145.00
## KWW 35.74 7.64 37.00 12.00 56.00
## educ 13.47 2.20 12.00 9.00 18.00
## exper 11.56 4.37 11.00 1.00 23.00
## tenure 7.23 5.08 7.00 0.00 22.00
## age 33.08 3.11 33.00 28.00 38.00
## married 0.89 0.31 1.00 0.00 1.00
## black 0.13 0.33 0.00 0.00 1.00
## south 0.34 0.47 0.00 0.00 1.00
## urban 0.72 0.45 1.00 0.00 1.00
## sibs 2.94 2.31 2.00 0.00 14.00
## brthord 2.28 1.60 2.00 1.00 10.00
## meduc 10.68 2.85 12.00 0.00 18.00
## feduc 10.22 3.30 10.00 0.00 18.00
## lwage 6.78 0.42 6.81 4.74 8.03
What are the factors affecting the wage of a person?
Note: In this analysis we neglect the factors {black,south,feduc,sibs,meduc,brthord} for simplicity of analysis.
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: int 1 1 1 1 1 1 0 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : int 1 1 1 1 1 1 1 1 0 1 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
wages$married[wages$married==0]<-'Unmarried'
wages$married[wages$married==1]<-'Married'
wages$married<-factor(wages$married)
wages$urban[wages$urban==0]<-'Rural'
wages$urban[wages$urban==1]<-'Urban'
wages$urban<-factor(wages$urban)
str(wages)
## 'data.frame': 935 obs. of 17 variables:
## $ wage : int 769 808 825 650 562 1400 600 1081 1154 1000 ...
## $ hours : int 40 50 40 40 40 40 40 40 45 40 ...
## $ IQ : int 93 119 108 96 74 116 91 114 111 95 ...
## $ KWW : int 35 41 46 32 27 43 24 50 37 44 ...
## $ educ : int 12 18 14 12 11 16 10 18 15 12 ...
## $ exper : int 11 11 11 13 14 14 13 8 13 16 ...
## $ tenure : int 2 16 9 7 5 2 0 14 1 16 ...
## $ age : int 31 37 33 32 34 35 30 38 36 36 ...
## $ married: Factor w/ 2 levels "Married","Unmarried": 1 1 1 1 1 1 2 1 1 1 ...
## $ black : int 0 0 0 0 0 1 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ urban : Factor w/ 2 levels "Rural","Urban": 2 2 2 2 2 2 2 2 1 2 ...
## $ sibs : int 1 1 1 4 10 1 1 2 2 1 ...
## $ brthord: int 2 NA 2 3 6 2 2 3 3 1 ...
## $ meduc : int 8 14 14 12 6 8 8 8 14 12 ...
## $ feduc : int 8 14 14 12 11 NA 8 NA 5 11 ...
## $ lwage : num 6.65 6.69 6.72 6.48 6.33 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "14 Apr 1999 13:41"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 105 98 105 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "monthly earnings" "average weekly hours" "IQ score" "knowledge of world work score" ...
## - attr(*, "version")= int 5
table(wages$married)
##
## Married Unmarried
## 835 100
table(wages$urban)
##
## Rural Urban
## 264 671
attach(wages)
mytable<-table(married,urban)
prop.table(mytable,1)*100
## urban
## married Rural Urban
## Married 28.86228 71.13772
## Unmarried 23.00000 77.00000
#Distribution of wages vs hours
boxplot(wage~hours,data = wages,main="Distribution of Wages with number of hours worked per week",xlab="Wages",ylab="Number of hours worked per week",horizontal=TRUE)
#Distribution of wages vs IQ
boxplot(wage~IQ,data = wages,main="Distribution of Wages with IQ",xlab="Wages",ylab="IQ",horizontal=TRUE)
#Distribution of wages vs KWW
boxplot(wage~KWW,data = wages,main="Distribution of Wages with KWW",xlab="Wages",ylab="Knowledge of Work World Score",horizontal=TRUE)
#Distribution of wages vs Education
boxplot(wage~educ,data = wages,main="Distribution of Wages with Education",xlab="Wages",ylab="Years of education",horizontal=TRUE)
#Distribution of wages vs Work Experience
boxplot(wage~exper,data = wages,main="Distribution of Wages with Work Experience",xlab="Wages",ylab="Years of work experience",horizontal=TRUE)
#Distribution of wages vs Tenure served
boxplot(wage~tenure,data = wages,main="Distribution of Wages with Tenure",xlab="Wages",ylab="Tenure",horizontal=TRUE)
#Distribution of wages vs Age
boxplot(wage~age,data = wages,main="Distribution of Wages with Age",xlab="Wages",ylab="Age",horizontal=TRUE)
#Distribution of wages vs Marital Status
boxplot(wage~married,data = wages,main="Distribution of Wages with Marital Status",xlab="Wages",ylab="Marital Status",horizontal=TRUE)
#Distribution of wages vs Location
boxplot(wage~urban,data = wages,main="Distribution of Wages with Location",xlab="Wages",ylab="Location",horizontal=TRUE)
library(lattice)
histogram(wage,main="Distribution of wage")
histogram(hours,main="Distribution of hours worked")
histogram(IQ,main="Distribution of IQ")
histogram(KWW, main="Distribution of KWW")
histogram(educ, main="Distribution of Years of Education")
histogram(exper, main="Distribution of Years of Work experience")
histogram(tenure, main="Distribution of Tenure")
histogram(age, main="Distribution of age")
histogram(married, main="Distribution of marital status")
histogram(urban, main="Distribution of location")
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(wage~hours,main="Scatterplot of Wage with No. of hours worked",xlab="No. of hours worked")
scatterplot(wage~IQ,main="Scatterplot of Wage with IQ")
scatterplot(wage~KWW,main="Scatterplot of Wage with KWW")
scatterplot(wage~educ,main="Scatterplot of Wage with Years of education",xlab="No. of years of education")
scatterplot(wage~exper,main="Scatterplot of Wage with Work experience",xlab = "No. of years of work experience")
scatterplot(wage~tenure,main="Scatterplot of Wage with Tenure")
scatterplot(wage~age,main="Scatterplot of Wage with Age")
dataColumns<-wages[,c(1:8)]
res<-cor(dataColumns)
round(res,2)
## wage hours IQ KWW educ exper tenure age
## wage 1.00 -0.01 0.31 0.33 0.33 0.00 0.13 0.16
## hours -0.01 1.00 0.07 0.11 0.09 -0.06 -0.06 0.02
## IQ 0.31 0.07 1.00 0.41 0.52 -0.22 0.04 -0.04
## KWW 0.33 0.11 0.41 1.00 0.39 0.02 0.14 0.39
## educ 0.33 0.09 0.52 0.39 1.00 -0.46 -0.04 -0.01
## exper 0.00 -0.06 -0.22 0.02 -0.46 1.00 0.24 0.50
## tenure 0.13 -0.06 0.04 0.14 -0.04 0.24 1.00 0.27
## age 0.16 0.02 -0.04 0.39 -0.01 0.50 0.27 1.00
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
dataColumns<-wages[,c(1:8)]
res<-cor(dataColumns)
corrgram(res,order = FALSE,upper.panel = panel.pie,lower.panel = panel.shade,text.panel = panel.txt,main="Wages data corrgram")
scatterplotMatrix(~wage+hours+IQ+KWW+educ+exper+tenure+age,main="Scatterplot matrix of variables")
scatterplotMatrix(~wage+hours+IQ+KWW+educ+exper+tenure+age|married, main="Scatterplot matrix of variables divided on the basis of marital status")
scatterplotMatrix(~wage+hours+IQ+KWW+educ+exper+tenure+age|urban, main="Scatterplot matrix of variables divided on the basis of location")
library(psych)
describe(wages)[,c(11:12)]
## skew kurtosis
## wage 1.20 2.68
## hours 1.59 4.14
## IQ -0.34 -0.03
## KWW -0.29 -0.33
## educ 0.55 -0.74
## exper 0.08 -0.57
## tenure 0.43 -0.81
## age 0.12 -1.26
## married* 2.54 4.45
## black 2.22 2.93
## south 0.67 -1.55
## urban* -0.97 -1.07
## sibs 1.44 2.73
## brthord 1.75 3.50
## meduc -0.50 0.92
## feduc -0.04 -0.04
## lwage -0.27 0.51
We see that apart from “married” and “black”, all variables satisfy the skewness and kurtosis test, and hence are normally distributed.
Also, there are no more categorical variables of relevance. So Pearson’s chi-squared test and t-test are not applicable.
dataColumns<-wages[,c(1:8)]
res<-cor(dataColumns)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.4.3
## corrplot 0.84 loaded
corrplot(res,method="circle")
cor.test(wage,IQ,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: wage and IQ
## t = 9.9272, df = 933, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2499278 0.3659487
## sample estimates:
## cor
## 0.3090878
Since the p-value < 0.05, so we can reject the null hypothesis that the wage and IQ are independent of each other. Also, since the correlation coefficient 0.309 is positive(>0), wage increases with increase in IQ.
cor.test(wage,KWW,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: wage and KWW
## t = 10.538, df = 933, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2676134 0.3822507
## sample estimates:
## cor
## 0.3261306
Since the p-value < 0.05, so we can reject the null hypothesis that the wage and KWW are independent of each other. Also, since the correlation coefficient 0.326 is positive(>0), wage increases with increase in KWW.
cor.test(wage,educ,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: wage and educ
## t = 10.573, df = 933, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2686296 0.3831853
## sample estimates:
## cor
## 0.3271087
Since the p-value < 0.05, so we can reject the null hypothesis that the wage and education are independent of each other. Also, since the correlation coefficient 0.327 is positive(>0), wage increases with increase in education.
cor.test(wage,hours,method = "pearson")
##
## Pearson's product-moment correlation
##
## data: wage and hours
## t = -0.29032, df = 933, p-value = 0.7716
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07357217 0.05464169
## sample estimates:
## cor
## -0.009504302
Since the p-value > 0.05, we fail to reject the null hypothesis that the wage is independent of the number of hours worked. So, surprisingly enough, the number of hours worked has no significant effect on the wage.
dataColumns<-wages[,c(2:8)]
res<-cor(dataColumns,method = "pearson")
corrplot(res)
round(res,2)
## hours IQ KWW educ exper tenure age
## hours 1.00 0.07 0.11 0.09 -0.06 -0.06 0.02
## IQ 0.07 1.00 0.41 0.52 -0.22 0.04 -0.04
## KWW 0.11 0.41 1.00 0.39 0.02 0.14 0.39
## educ 0.09 0.52 0.39 1.00 -0.46 -0.04 -0.01
## exper -0.06 -0.22 0.02 -0.46 1.00 0.24 0.50
## tenure -0.06 0.04 0.14 -0.04 0.24 1.00 0.27
## age 0.02 -0.04 0.39 -0.01 0.50 0.27 1.00
#Educational factors
#The variables related to academics are highly correlated
acad<-wages[,c(3,4,5)]
acadplot<-cor(acad)
corrplot(acadplot)
acad<-wages[,c(3,4,5)]
acadplot<-cor(acad)
round(acadplot,2)
## IQ KWW educ
## IQ 1.00 0.41 0.52
## KWW 0.41 1.00 0.39
## educ 0.52 0.39 1.00
#The IQ is highly correlated with KWW and education(educ)
#We will include 'KWW' and 'educ' in our regression, but exclude 'IQ' from our regression.
#Other factors(Work Experience, Tenure and Age)
#The variables 'exper','tenure' and 'age' are highly correlated
other<-wages[,c(6:8)]
otherplot<-cor(other)
corrplot(otherplot)
other<-wages[,c(6:8)]
otherplot<-cor(other)
round(otherplot,2)
## exper tenure age
## exper 1.00 0.24 0.50
## tenure 0.24 1.00 0.27
## age 0.50 0.27 1.00
#These variables are highly correlated with each other, but 'exper' and 'tenure' are weakly correlated with each other.
#Therefore, in our regression, we will include 'exper' and 'tenure' but exclude 'age'.
#SUMMARY OF MODEL SELECTION
#Given the above discussion, the dependent variable is 'wage'
#The independent variables we will include in the regression are {KWW,educ,exper,tenure}
columns=c("wage","KWW","educ","exper","tenure")
modelvar<-wages[,columns]
res<-cor(modelvar)
round(res,2)
## wage KWW educ exper tenure
## wage 1.00 0.33 0.33 0.00 0.13
## KWW 0.33 1.00 0.39 0.02 0.14
## educ 0.33 0.39 1.00 -0.46 -0.04
## exper 0.00 0.02 -0.46 1.00 0.24
## tenure 0.13 0.14 -0.04 0.24 1.00
library(corrplot)
columns=c("wage","KWW","educ","exper","tenure")
modelvar<-wages[,columns]
res<-cor(modelvar)
corrplot(res)
#SCATTER PLOTS
library(car)
scatterplotMatrix(~wage+KWW+educ,data = wages,main="Distribution of wages with educational qualifications")
library(car)
scatterplotMatrix(~wage+exper+tenure,data = wages,main="Distribution of wages with work experience and tenure")
model1<-wage~KWW+educ+exper+tenure+married+urban
fit1<-lm(model1, data = wages)
summary(fit1)
##
## Call:
## lm(formula = model1, data = wages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -850.03 -219.11 -40.51 180.62 2221.72
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -398.461 104.111 -3.827 0.000138 ***
## KWW 8.498 1.742 4.878 1.26e-06 ***
## educ 58.215 6.712 8.673 < 2e-16 ***
## exper 10.822 3.187 3.396 0.000713 ***
## tenure 6.836 2.412 2.835 0.004688 **
## marriedUnmarried -168.668 38.424 -4.390 1.27e-05 ***
## urbanUrban 156.100 26.251 5.946 3.88e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 358.5 on 928 degrees of freedom
## Multiple R-squared: 0.2192, Adjusted R-squared: 0.2141
## F-statistic: 43.42 on 6 and 928 DF, p-value: < 2.2e-16
Since the p-value is less than 0.05, we can reject the null hypothesis and so there is a significant relationship between the variables in the linear regression model.
library(leaps)
leap<-regsubsets(model1,data=wages,nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(model1, data = wages, nbest = 1)
## 6 Variables (and intercept)
## Forced in Forced out
## KWW FALSE FALSE
## educ FALSE FALSE
## exper FALSE FALSE
## tenure FALSE FALSE
## marriedUnmarried FALSE FALSE
## urbanUrban FALSE FALSE
## 1 subsets of each size up to 6
## Selection Algorithm: exhaustive
## KWW educ exper tenure marriedUnmarried urbanUrban
## 1 ( 1 ) " " "*" " " " " " " " "
## 2 ( 1 ) "*" "*" " " " " " " " "
## 3 ( 1 ) "*" "*" " " " " " " "*"
## 4 ( 1 ) "*" "*" " " " " "*" "*"
## 5 ( 1 ) "*" "*" "*" " " "*" "*"
## 6 ( 1 ) "*" "*" "*" "*" "*" "*"
plot(leap, scale = "adjr2")
model2<-wage~married+urban
fit2<-lm(model2, data = wages)
summary(fit2)
##
## Call:
## lm(formula = model2, data = wages)
##
## Residuals:
## Min 1Q Median 3Q Max
## -914.97 -263.61 -49.61 185.71 2048.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 846.61 24.42 34.668 < 2e-16 ***
## marriedUnmarried -189.36 41.56 -4.557 5.89e-06 ***
## urbanUrban 183.36 28.53 6.427 2.08e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 392.4 on 932 degrees of freedom
## Multiple R-squared: 0.0603, Adjusted R-squared: 0.05828
## F-statistic: 29.9 on 2 and 932 DF, p-value: 2.589e-13
library(leaps)
leap<-regsubsets(model2,data=wages,nbest=1)
summary(leap)
## Subset selection object
## Call: regsubsets.formula(model2, data = wages, nbest = 1)
## 2 Variables (and intercept)
## Forced in Forced out
## marriedUnmarried FALSE FALSE
## urbanUrban FALSE FALSE
## 1 subsets of each size up to 2
## Selection Algorithm: exhaustive
## marriedUnmarried urbanUrban
## 1 ( 1 ) " " "*"
## 2 ( 1 ) "*" "*"
plot(leap, scale = "adjr2")
library(coefplot)
## Warning: package 'coefplot' was built under R version 3.4.3
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
coefplot(fit1,intercept=FALSE)
library(coefplot)
coefplot(fit2,intercept=FALSE)
summary(fit1)$adj.r.squared
## [1] 0.21413
summary(fit2)$adj.r.squared
## [1] 0.05828247
AIC(fit1)
## [1] 13661.4
AIC(fit2)
## [1] 13826.58
Since the p-value is lesser, adjusted R squared value is higher, and the AIC value lesser, for model1, model1 is our ‘best’ ordinary least squares model. Model 1 predicts the “wage” as a function of the following independent variables(or regressors) : {KWW, educ, exper, tenure, married ,urban}.
Using OLS regression model, we formulate a regression model for wage as a dependent variable, depending on the independent variables KWW, Years of education, Work Experience,and Tenure. Surprisingly, in this model wage is independent of number of hours worked, IQ, and age.