Task One: mean, sd, min & max

Step 1: Load the data “morg96.dta”

library(foreign)
data<-read.dta("morg96.dta")

Step 2: Subset the data

Subset data with age. I use variable “age”. Details about this variable is on CPS Labor Extracts, page 20. The person’s age should “>=25&<=55”.

data<-subset(data, age>=25&age<=55)

Have a quick view

summary(data$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   25.00   32.00   39.00   39.27   46.00   55.00

Subset data with work hours last week. I use variable “hourslw”. Details about this variable is on CPS Labor Extracts, page 41. The person should work at least 35 hours during his last week of work.

data<-subset(data, hourslw>=35)

Have a quick view

summary(data$hourslw)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   35.00   40.00   40.00   45.61   50.00   99.00

Subset data with class of worker. I use variable “class94”. Details about this variable is on CPS Labor Extracts, page 36. The person should work at least 35 hours during his last week of work.

data<-subset(data, class94!=6&class94!=7) 

Have a quick view

table(data$class94)
## 
##     1     2     3     4     5     8 
##  3168  4357  7697 67153  4556    74

Save the cleaned data as “dataC”

dataC<-data

Step 3: Code the number of years of schooling.

Based on general knowledge, the Coding method is:

data.frame(Grade92=31:46,SchoolYears=c(0,2.5,5.5,7.5,9,10,11,12,15,19,19,19,19,21,21,24))
##    Grade92 SchoolYears
## 1       31         0.0
## 2       32         2.5
## 3       33         5.5
## 4       34         7.5
## 5       35         9.0
## 6       36        10.0
## 7       37        11.0
## 8       38        12.0
## 9       39        15.0
## 10      40        19.0
## 11      41        19.0
## 12      42        19.0
## 13      43        19.0
## 14      44        21.0
## 15      45        21.0
## 16      46        24.0

Extract “grade92” as a factor. Map the person’s education level to years of schooling completed. Use the new variable to replace “grade92”

temp<-factor(data$grade92)
temp<-factor(temp, labels=c(0,2.5,5.5,7.5,9,10,11,12,15,19,19,19,19,21,21,24)) 
## Warning in `levels<-`(`*tmp*`, value = if (nl == nL) as.character(labels)
## else paste0(labels, : duplicated levels in factors are deprecated
data$grade92<-as.numeric(as.vector(temp)) 

Step 4: Group the new data, and calculte mean, srandard, min, max values.

These four variables are “earnwke”, “grade92”, “sex” and “age”, respectively on page 33,25,18,20 in CPS Labor Extracts.
Delete those who are not in private sector or government. Here I only consider private for profit.

data<-subset(data,class94%in%1:4)

Group the data by class. Calculte min, max, mean, sd values.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
Class<-group_by(data, class94)
dataN<-as.data.frame(summarize(Class, 
    earn.mean=mean(earnwke),
    earn.sd=sd(earnwke),
    earn.min=min(earnwke),
    earn.max=max(earnwke),
    eduyears.mean=mean(grade92),
    eduyears.sd=sd(grade92),
    eduyears.min=min(grade92),
    eduyears.max=max(grade92),
    sex.mean=mean(sex),
    sex.sd=sd(sex),
    sex.min=min(sex),
    sex.max=max(sex),
    age.mean=mean(age),
    age.sd=sd(age),
    age.min=min(age),
    age.max=max(age)
))

Reshape it

c<-c(2,6,10,14)
dataMSMM<-data.frame(
    item=rep(c("weekly earning", "schooling years","gender","age"),each=4), 
  Class=rep(c("Federal Government","State Government","Local Government","Private, for profit"), 4) ,
    mean=as.vector(as.matrix(dataN[,c])) ,
    sd=as.vector(as.matrix(dataN[,c+1])),
    min=as.vector(as.matrix(dataN[,c+2])),
    max=as.vector(as.matrix(dataN[,c+3]))
)

Step 5: Report mean, sd, min & max values

dataMSMM
##               item               Class       mean          sd  min  max
## 1   weekly earning  Federal Government 725.593119 349.4450671  0.0 1923
## 2   weekly earning    State Government 619.835667 321.8581740  0.0 1923
## 3   weekly earning    Local Government 633.316487 330.2363949  0.0 1923
## 4   weekly earning Private, for profit 605.645094 374.6180955  0.0 1923
## 5  schooling years  Federal Government  18.061711   2.4379697  2.5   24
## 6  schooling years    State Government  18.552100   2.8135419  2.5   24
## 7  schooling years    Local Government  18.227101   2.7241992  0.0   24
## 8  schooling years Private, for profit  16.681288   3.4650666  0.0   24
## 9           gender  Federal Government   1.433081   0.4955798  1.0    2
## 10          gender    State Government   1.516181   0.4997955  1.0    2
## 11          gender    Local Government   1.537742   0.4986059  1.0    2
## 12          gender Private, for profit   1.401501   0.4902056  1.0    2
## 13             age  Federal Government  41.551768   7.8023725 25.0   55
## 14             age    State Government  40.785632   8.1583809 25.0   55
## 15             age    Local Government  40.846823   8.2978469 25.0   55
## 16             age Private, for profit  38.374280   8.3162998 25.0   55

Task Two: merge data

Load the data “housing_supply.dta”

library(readstata13)
dataH<-read.dta13("housing_supply.dta")

Merge. Drop those without “unaval”

dataM <- merge(x=dataH,y=dataC, by.x='msafips', by.y='msafips')
head(data,1)
##   hurespli intmonth minsamp hrsample serial hhnum state stfips county
## 9        1        1       4      A65     -1     1    63      1      0
##   centcity msafips cmsacode smsastat smsa93 icntcity            hhid sex
## 9        1    5240       NA        1      3       NA 199270459680970   1
##   veteran grade92 famnum selfproxy why3594 class94 unioncov studftpt
## 9       6      15      0         1      NA       4        2       NA
##   relref95 age pfamrel marital race hourslw reason94 absent94 dwrsn ind80
## 9        2  37       0       5    1      41       NA       NA    NA   282
##   occ80 uhourse laydur paidhre unionmme ethnic lfsr94 untype ftpt94 agri
## 9    22      40     NA       1        2      8      1     NA      2    0
##   eligible otc earnhre earnwke schenr schlvl   weight  earnwt I25a I25b
## 9        1   1     910     446     NA     NA 2790.368 10966.5    0    0
##   I25c I25d penatvty pemntvty pefntvty prcitshp prcitflg peinusyr hrlonglk
## 9    0    0       57       57       57        1        0        0        2
##   lineno recnum year ym_file  ym docc80 dind
## 9      1     18 1996     432 429      2    9

Task Three: regression

Step 1: Subset “dataC”, the cleaned data in Task One.

Only keep variables “earnwke” and “class94”.

dataEG<-data.frame(income=dataC$earnwke,class=dataC$class94)

Only keep those work in private sector for profits or local government.

dataEG<-subset(dataEG,class==4|class==3)

Code their class: class=1 if she’s in private sector; class=0 if she’s in local government

dataEG$class<-dataEG$class-3 

Delete those with 0 income

dataEG<-subset(dataEG,income>0)

Step 2: Create the new variable for ln(weekly earnings). Name it as “LEW”.

dataEG$income<-log(dataEG$income)
colnames(dataEG)[1]<-"LEW"

Step 3: Regress ln(weekly earnings) on class

fit<-lm( LEW ~ class, data=dataEG )
library(xtable)
xt<-xtable(summary(fit))

Here is the regression result.

print(xt, type='html')
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6.3136 0.0067 938.42 0.0000
class -0.0757 0.0071 -10.66 0.0000

Plot the result

library(ggplot2)
g<-ggplot(dataEG, aes(class, LEW))
g+geom_point(color="steelblue",size=1.5 )+geom_smooth(method = 'lm')+labs(title = "ln(weekly earnings) on Class") + labs(x = 'Class (private =1, local gov =0)', y = "ln(weekly earning)")

We can clonclude that local government workers have higher wage.

Some other questions

Q.1: Do the outliers matter?

Delete outliers who: ln(weekly earnings)<3

dataEG2<-subset(dataEG,LEW>=3)

Do new regression

fit2<-lm( LEW ~ class, data=dataEG2 )

Put these two fit line together. We can see that in the plot graph, new fit lines overlapped the old.

plot(dataEG$class, dataEG$LEW, col=rgb(0,0,0.8,0.3))
abline(fit$coefficients[1], fit$coefficients[2],col=rgb(0,0,1,0.3), lwd=3, main='with outlier')
abline(fit2$coefficients[1], fit2$coefficients[2],col=rgb(1,0,0,0.3), lwd=3, main ='without outlier')

Since new fit line overlapped the old, the outliers do not make much difference.

Q.2: Dignostic plots

layout(matrix(c(1,2,3,4),2,2))
plot(fit)

Q.3: Hierarchical clustering

myplclust<- function(hclust, lab = hclust$labels, lab.col = rep(1, length(hclust$labels)), hang = 0.1, ...){
    y <- rep(hclust$height, 2) 
    x <- as.numeric(hclust$merge) 
    y <- y[which(x < 0)] 
    x <- x[which(x < 0)]
    x <- abs(x)
    y <- y[order(x)]
    x <- x[order(x)]
    plot(hclust, labels = FALSE, hang = hang, ...)
    text(x = x, y = y[hclust$order] - (max(hclust$height) * hang), labels = lab[hclust$order], col = lab.col[hclust$order], srt = 90, adj = c(1, 0.5), xpd = NA, ...)
}
dataEGS<-dataEG[sample(nrow(dataEG),400),]
distxy <- dist(dataEGS[,1])
hClustering <- hclust(distxy)

myplclust(hClustering, lab = dataEGS[,2], lab.col = dataEGS[,2]+2 )

Cannot find a clear pattern between the wages of private sector worker and local government worker