library(foreign)
data<-read.dta("morg96.dta")
Subset data with age. I use variable “age”. Details about this variable is on CPS Labor Extracts, page 20. The person’s age should “>=25&<=55”.
data<-subset(data, age>=25&age<=55)
Have a quick view
summary(data$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 25.00 32.00 39.00 39.27 46.00 55.00
Subset data with work hours last week. I use variable “hourslw”. Details about this variable is on CPS Labor Extracts, page 41. The person should work at least 35 hours during his last week of work.
data<-subset(data, hourslw>=35)
Have a quick view
summary(data$hourslw)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 35.00 40.00 40.00 45.61 50.00 99.00
Subset data with class of worker. I use variable “class94”. Details about this variable is on CPS Labor Extracts, page 36. The person should work at least 35 hours during his last week of work.
data<-subset(data, class94!=6&class94!=7)
Have a quick view
table(data$class94)
##
## 1 2 3 4 5 8
## 3168 4357 7697 67153 4556 74
Save the cleaned data as “dataC”
dataC<-data
Based on general knowledge, the Coding method is:
data.frame(Grade92=31:46,SchoolYears=c(0,2.5,5.5,7.5,9,10,11,12,15,19,19,19,19,21,21,24))
## Grade92 SchoolYears
## 1 31 0.0
## 2 32 2.5
## 3 33 5.5
## 4 34 7.5
## 5 35 9.0
## 6 36 10.0
## 7 37 11.0
## 8 38 12.0
## 9 39 15.0
## 10 40 19.0
## 11 41 19.0
## 12 42 19.0
## 13 43 19.0
## 14 44 21.0
## 15 45 21.0
## 16 46 24.0
Extract “grade92” as a factor. Map the person’s education level to years of schooling completed. Use the new variable to replace “grade92”
temp<-factor(data$grade92)
temp<-factor(temp, labels=c(0,2.5,5.5,7.5,9,10,11,12,15,19,19,19,19,21,21,24))
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
data$grade92<-as.numeric(as.vector(temp))
These four variables are “earnwke”, “grade92”, “sex” and “age”, respectively on page 33,25,18,20 in CPS Labor Extracts.
Delete those who are not in private sector or government. Here I only consider private for profit.
data<-subset(data,class94%in%1:4)
Group the data by class. Calculte min, max, mean, sd values.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
Class<-group_by(data, class94)
dataN<-as.data.frame(summarize(Class,
earn.mean=mean(earnwke),
earn.sd=sd(earnwke),
earn.min=min(earnwke),
earn.max=max(earnwke),
eduyears.mean=mean(grade92),
eduyears.sd=sd(grade92),
eduyears.min=min(grade92),
eduyears.max=max(grade92),
sex.mean=mean(sex),
sex.sd=sd(sex),
sex.min=min(sex),
sex.max=max(sex),
age.mean=mean(age),
age.sd=sd(age),
age.min=min(age),
age.max=max(age)
))
Reshape it
c<-c(2,6,10,14)
dataMSMM<-data.frame(
item=rep(c("weekly earning", "schooling years","gender","age"),each=4),
Class=rep(c("Federal Government","State Government","Local Government","Private, for profit"), 4) ,
mean=as.vector(as.matrix(dataN[,c])) ,
sd=as.vector(as.matrix(dataN[,c+1])),
min=as.vector(as.matrix(dataN[,c+2])),
max=as.vector(as.matrix(dataN[,c+3]))
)
dataMSMM
## item Class mean sd min max
## 1 weekly earning Federal Government 725.593119 349.4450671 0.0 1923
## 2 weekly earning State Government 619.835667 321.8581740 0.0 1923
## 3 weekly earning Local Government 633.316487 330.2363949 0.0 1923
## 4 weekly earning Private, for profit 605.645094 374.6180955 0.0 1923
## 5 schooling years Federal Government 18.061711 2.4379697 2.5 24
## 6 schooling years State Government 18.552100 2.8135419 2.5 24
## 7 schooling years Local Government 18.227101 2.7241992 0.0 24
## 8 schooling years Private, for profit 16.681288 3.4650666 0.0 24
## 9 gender Federal Government 1.433081 0.4955798 1.0 2
## 10 gender State Government 1.516181 0.4997955 1.0 2
## 11 gender Local Government 1.537742 0.4986059 1.0 2
## 12 gender Private, for profit 1.401501 0.4902056 1.0 2
## 13 age Federal Government 41.551768 7.8023725 25.0 55
## 14 age State Government 40.785632 8.1583809 25.0 55
## 15 age Local Government 40.846823 8.2978469 25.0 55
## 16 age Private, for profit 38.374280 8.3162998 25.0 55
Load the data “housing_supply.dta”
library(readstata13)
dataH<-read.dta13("housing_supply.dta")
Merge. Drop those without “unaval”
dataM <- merge(x=dataH,y=dataC, by.x='msafips', by.y='msafips')
head(data,1)
## hurespli intmonth minsamp hrsample serial hhnum state stfips county
## 9 1 1 4 A65 -1 1 63 1 0
## centcity msafips cmsacode smsastat smsa93 icntcity hhid sex
## 9 1 5240 NA 1 3 NA 199270459680970 1
## veteran grade92 famnum selfproxy why3594 class94 unioncov studftpt
## 9 6 15 0 1 NA 4 2 NA
## relref95 age pfamrel marital race hourslw reason94 absent94 dwrsn ind80
## 9 2 37 0 5 1 41 NA NA NA 282
## occ80 uhourse laydur paidhre unionmme ethnic lfsr94 untype ftpt94 agri
## 9 22 40 NA 1 2 8 1 NA 2 0
## eligible otc earnhre earnwke schenr schlvl weight earnwt I25a I25b
## 9 1 1 910 446 NA NA 2790.368 10966.5 0 0
## I25c I25d penatvty pemntvty pefntvty prcitshp prcitflg peinusyr hrlonglk
## 9 0 0 57 57 57 1 0 0 2
## lineno recnum year ym_file ym docc80 dind
## 9 1 18 1996 432 429 2 9
Only keep variables “earnwke” and “class94”.
dataEG<-data.frame(income=dataC$earnwke,class=dataC$class94)
Only keep those work in private sector for profits or local government.
dataEG<-subset(dataEG,class==4|class==3)
Code their class: class=1 if she’s in private sector; class=0 if she’s in local government
dataEG$class<-dataEG$class-3
Delete those with 0 income
dataEG<-subset(dataEG,income>0)
dataEG$income<-log(dataEG$income)
colnames(dataEG)[1]<-"LEW"
fit<-lm( LEW ~ class, data=dataEG )
library(xtable)
xt<-xtable(summary(fit))
Here is the regression result.
print(xt, type='html')
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 6.3136 | 0.0067 | 938.42 | 0.0000 |
| class | -0.0757 | 0.0071 | -10.66 | 0.0000 |
Plot the result
library(ggplot2)
g<-ggplot(dataEG, aes(class, LEW))
g+geom_point(color="steelblue",size=1.5 )+geom_smooth(method = 'lm')+labs(title = "ln(weekly earnings) on Class") + labs(x = 'Class (private =1, local gov =0)', y = "ln(weekly earning)")
We can clonclude that local government workers have higher wage.
Delete outliers who: ln(weekly earnings)<3
dataEG2<-subset(dataEG,LEW>=3)
Do new regression
fit2<-lm( LEW ~ class, data=dataEG2 )
Put these two fit line together. We can see that in the plot graph, new fit lines overlapped the old.
plot(dataEG$class, dataEG$LEW, col=rgb(0,0,0.8,0.3))
abline(fit$coefficients[1], fit$coefficients[2],col=rgb(0,0,1,0.3), lwd=3, main='with outlier')
abline(fit2$coefficients[1], fit2$coefficients[2],col=rgb(1,0,0,0.3), lwd=3, main ='without outlier')
Since new fit line overlapped the old, the outliers do not make much difference.
layout(matrix(c(1,2,3,4),2,2))
plot(fit)
myplclust<- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)), hang = 0.1, ...){
y <- rep(hclust$height, 2)
x <- as.numeric(hclust$merge)
y <- y[which(x < 0)]
x <- x[which(x < 0)]
x <- abs(x)
y <- y[order(x)]
x <- x[order(x)]
plot(hclust, labels = FALSE, hang = hang, ...)
text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order], col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}
dataEGS<-dataEG[sample(nrow(dataEG),400),]
distxy <- dist(dataEGS[,1])
hClustering <- hclust(distxy)
myplclust(hClustering, lab = dataEGS[,2], lab.col = dataEGS[,2]+2 )
Cannot find a clear pattern between the wages of private sector worker and local government worker