# C1 ----
# Section C1 works with the WAGE1.DTA data set.
# (The section label used to be a bare `C1`, which R evaluates as a symbol and
# aborts with "object 'C1' not found" when the script is sourced.)

library(foreign)

# Every data set in this script is read with a relative file name, so the
# working directory must be the folder that holds the Stata files. The
# original location stays the default; set the WOOLDRIDGE_DATA_DIR
# environment variable to point the script at another folder.
# The former `rm(list = ls())` was dropped: it deletes the caller's objects
# without actually giving the script a fresh session.
data_dir <- Sys.getenv(
  "WOOLDRIDGE_DATA_DIR",
  unset = "/Users/vancam/Documents/WAIKATO-Thesis/Rworking/Wooldridge/StataFile"
)
setwd(data_dir)

wage1 <- read.dta("WAGE1.DTA")
## Warning in read.dta("WAGE1.DTA"): cannot read factor labels from Stata 5
## files
# Inspect column names and types of the loaded data frame.
str(wage1)
## 'data.frame': 526 obs. of 24 variables:
## $ wage : num 3.1 3.24 3 6 5.3 ...
## $ educ : int 11 12 11 8 12 16 18 12 12 17 ...
## $ exper : int 2 22 2 44 7 9 15 5 26 22 ...
## $ tenure : int 0 2 0 28 2 8 7 3 4 21 ...
## $ nonwhite: int 0 0 0 0 0 0 0 0 0 0 ...
## $ female : int 1 1 0 0 0 0 0 1 1 0 ...
## $ married : int 0 1 0 1 1 1 0 0 0 1 ...
## $ numdep : int 2 3 2 0 1 0 0 0 2 0 ...
## $ smsa : int 1 1 0 1 0 1 1 1 1 1 ...
## $ northcen: int 0 0 0 0 0 0 0 0 0 0 ...
## $ south : int 0 0 0 0 0 0 0 0 0 0 ...
## $ west : int 1 1 1 1 1 1 1 1 1 1 ...
## $ construc: int 0 0 0 0 0 0 0 0 0 0 ...
## $ ndurman : int 0 0 0 0 0 0 0 0 0 0 ...
## $ trcommpu: int 0 0 0 0 0 0 0 0 0 0 ...
## $ trade : int 0 0 1 0 0 0 1 0 1 0 ...
## $ services: int 0 1 0 0 0 0 0 0 0 0 ...
## $ profserv: int 0 0 0 0 0 1 0 0 0 0 ...
## $ profocc : int 0 0 0 0 0 1 1 1 1 1 ...
## $ clerocc : int 0 0 0 1 0 0 0 0 0 0 ...
## $ servocc : int 0 1 0 0 0 0 0 0 0 0 ...
## $ lwage : num 1.13 1.18 1.1 1.79 1.67 ...
## $ expersq : int 4 484 4 1936 49 81 225 25 676 484 ...
## $ tenursq : int 0 4 0 784 4 64 49 9 16 441 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "16 Sep 1996 15:52"
## - attr(*, "formats")= chr "%8.2g" "%8.0g" "%8.0g" "%8.0g" ...
## - attr(*, "types")= int 102 98 98 98 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "average hourly earnings" "years of education" "years potential experience" "years with current employer" ...
## - attr(*, "version")= int 5
# Per-column descriptive statistics: min, quartiles, median, mean and max.
summary(object = wage1)
## wage educ exper tenure
## Min. : 0.530 Min. : 0.00 Min. : 1.00 Min. : 0.000
## 1st Qu.: 3.330 1st Qu.:12.00 1st Qu.: 5.00 1st Qu.: 0.000
## Median : 4.650 Median :12.00 Median :13.50 Median : 2.000
## Mean : 5.896 Mean :12.56 Mean :17.02 Mean : 5.105
## 3rd Qu.: 6.880 3rd Qu.:14.00 3rd Qu.:26.00 3rd Qu.: 7.000
## Max. :24.980 Max. :18.00 Max. :51.00 Max. :44.000
## nonwhite female married numdep
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.000
## Median :0.0000 Median :0.0000 Median :1.0000 Median :1.000
## Mean :0.1027 Mean :0.4791 Mean :0.6084 Mean :1.044
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:2.000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :6.000
## smsa northcen south west
## Min. :0.0000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.0000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.7224 Mean :0.251 Mean :0.3555 Mean :0.1692
## 3rd Qu.:1.0000 3rd Qu.:0.750 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.000 Max. :1.0000 Max. :1.0000
## construc ndurman trcommpu trade
## Min. :0.00000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.00000 Median :0.0000
## Mean :0.04563 Mean :0.1141 Mean :0.04373 Mean :0.2871
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## services profserv profocc clerocc
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.0000 Median :0.0000 Median :0.0000 Median :0.0000
## Mean :0.1008 Mean :0.2586 Mean :0.3669 Mean :0.1673
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## servocc lwage expersq tenursq
## Min. :0.0000 Min. :-0.6349 Min. : 1.0 Min. : 0.00
## 1st Qu.:0.0000 1st Qu.: 1.2030 1st Qu.: 25.0 1st Qu.: 0.00
## Median :0.0000 Median : 1.5369 Median : 182.5 Median : 4.00
## Mean :0.1407 Mean : 1.6233 Mean : 473.4 Mean : 78.15
## 3rd Qu.:0.0000 3rd Qu.: 1.9286 3rd Qu.: 676.0 3rd Qu.: 49.00
## Max. :1.0000 Max. : 3.2181 Max. :2601.0 Max. :1936.00
# `female` is a 0/1 column, so its sum is the number of women in the sample.
n_female <- sum(wage1$female)
n_female
## [1] 252
# Men are the remaining rows. Derived from nrow() instead of the retyped
# literals 526 and 252 so the count keeps following the data.
nrow(wage1) - n_female
## [1] 274
# C2 ----
# Section C2 works with the BWGHT.DTA data set.
# (Written as a comment: a bare `C2` is evaluated as a symbol and stops the
# script with "object 'C2' not found".)
bwght <- read.dta("BWGHT.DTA")
## Warning in read.dta("BWGHT.DTA"): cannot read factor labels from Stata 5
## files
# Inspect column names and types of the loaded data frame.
str(bwght)
## 'data.frame': 1388 obs. of 14 variables:
## $ faminc : num 13.5 7.5 0.5 15.5 27.5 7.5 65 27.5 27.5 37.5 ...
## $ cigtax : num 16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 ...
## $ cigprice: num 122 122 122 122 122 ...
## $ bwght : int 109 133 129 126 134 118 140 86 121 129 ...
## $ fatheduc: int 12 6 NA 12 14 12 16 12 12 16 ...
## $ motheduc: int 12 12 12 12 12 14 14 14 17 18 ...
## $ parity : int 1 2 2 2 2 6 2 2 2 2 ...
## $ male : int 1 1 0 1 1 1 0 0 0 0 ...
## $ white : int 1 0 0 0 1 0 1 0 1 1 ...
## $ cigs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ lbwght : num 4.69 4.89 4.86 4.84 4.9 ...
## $ bwghtlbs: num 6.81 8.31 8.06 7.88 8.38 ...
## $ packs : num 0 0 0 0 0 0 0 0 0 0 ...
## $ lfaminc : num 2.603 2.015 -0.693 2.741 3.314 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr " 3 Jun 1997 13:47"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%8.0g" ...
## - attr(*, "types")= int 102 102 102 105 98 98 98 98 98 98 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "1988 family income, $1000s" "cig. tax in home state, 1988" "cig. price in home state, 1988" "birth weight, ounces" ...
## - attr(*, "version")= int 5
# Preview the first six rows of the data.
head(bwght, n = 6L)
## faminc cigtax cigprice bwght fatheduc motheduc parity male white cigs
## 1 13.5 16.5 122.3 109 12 12 1 1 1 0
## 2 7.5 16.5 122.3 133 6 12 2 1 0 0
## 3 0.5 16.5 122.3 129 NA 12 2 0 0 0
## 4 15.5 16.5 122.3 126 12 12 2 1 0 0
## 5 27.5 16.5 122.3 134 14 12 2 1 1 0
## 6 7.5 16.5 122.3 118 12 14 6 1 0 0
## lbwght bwghtlbs packs lfaminc
## 1 4.691348 6.8125 0 2.6026897
## 2 4.890349 8.3125 0 2.0149031
## 3 4.859812 8.0625 0 -0.6931472
## 4 4.836282 7.8750 0 2.7408400
## 5 4.897840 8.3750 0 3.3141861
## 6 4.770685 7.3750 0 2.0149031
# Number of rows with male == 1.
n_male <- sum(bwght$male == 1)
n_male
## [1] 723
# Number of rows with male == 0, counted directly ...
sum(bwght$male == 0)
## [1] 665
# ... and cross-checked as the rest of the sample. nrow() replaces the
# retyped literals 1388 and 723 so the check keeps following the data.
nrow(bwght) - n_male
## [1] 665
# Rows with male == 0 and a positive `cigs` value.
# NOTE(review): `male` may describe the child rather than the woman. If so,
# every row is one woman and the number of smokers would be
# sum(bwght$cigs > 0) -- confirm against the data set documentation.
sum(bwght$male == 0 & bwght$cigs > 0)
## [1] 112
# Per-column descriptive statistics: min, quartiles, median, mean and max.
summary(bwght)
## faminc cigtax cigprice bwght
## Min. : 0.50 Min. : 2.00 Min. :103.8 Min. : 23.0
## 1st Qu.:14.50 1st Qu.:15.00 1st Qu.:122.8 1st Qu.:107.0
## Median :27.50 Median :20.00 Median :130.8 Median :120.0
## Mean :29.03 Mean :19.55 Mean :130.6 Mean :118.7
## 3rd Qu.:37.50 3rd Qu.:26.00 3rd Qu.:137.0 3rd Qu.:132.0
## Max. :65.00 Max. :38.00 Max. :152.5 Max. :271.0
##
## fatheduc motheduc parity male
## Min. : 1.00 Min. : 2.00 Min. :1.000 Min. :0.0000
## 1st Qu.:12.00 1st Qu.:12.00 1st Qu.:1.000 1st Qu.:0.0000
## Median :12.00 Median :12.00 Median :1.000 Median :1.0000
## Mean :13.19 Mean :12.94 Mean :1.633 Mean :0.5209
## 3rd Qu.:16.00 3rd Qu.:14.00 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :18.00 Max. :18.00 Max. :6.000 Max. :1.0000
## NA's :196 NA's :1
## white cigs lbwght bwghtlbs
## Min. :0.0000 Min. : 0.000 Min. :3.135 Min. : 1.438
## 1st Qu.:1.0000 1st Qu.: 0.000 1st Qu.:4.673 1st Qu.: 6.688
## Median :1.0000 Median : 0.000 Median :4.787 Median : 7.500
## Mean :0.7846 Mean : 2.087 Mean :4.760 Mean : 7.419
## 3rd Qu.:1.0000 3rd Qu.: 0.000 3rd Qu.:4.883 3rd Qu.: 8.250
## Max. :1.0000 Max. :50.000 Max. :5.602 Max. :16.938
##
## packs lfaminc
## Min. :0.0000 Min. :-0.6931
## 1st Qu.:0.0000 1st Qu.: 2.6741
## Median :0.0000 Median : 3.3142
## Mean :0.1044 Mean : 3.0713
## 3rd Qu.:0.0000 3rd Qu.: 3.6243
## Max. :2.5000 Max. : 4.1744
##
# Open question: no variable about the women's pregnancy has been identified
# yet. Which variable is it?
# Report the average family income and its standard deviation.
# `faminc` is recorded in $1000s (see its variable label), so the mean of
# 29.03 from summary() is about $29,030, and the standard deviation must be
# multiplied by 1000 to be expressed in dollars.
sd(bwght$faminc) * 1000 # standard deviation of family income, in dollars
## [1] 18739.28
# C3 ----
# Section C3 works with the MEAP01.DTA data set.
# (Written as a comment: a bare `C3` is evaluated as a symbol and stops the
# script with "object 'C3' not found".)
meap01 <- read.dta("MEAP01.DTA")
summary(meap01$math4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 61.60 76.40 71.91 87.00 100.00
# 1. The largest value of math4 is 100, the smallest is 0.
# Inspect column names and types of the loaded data frame.
str(meap01)
## 'data.frame': 1823 obs. of 11 variables:
## $ dcode : num 1010 2070 2080 3010 3010 3010 3020 3020 3020 3030 ...
## $ bcode : int 4937 597 4860 790 1403 4056 922 2864 4851 881 ...
## $ math4 : num 83.3 90.3 61.9 85.7 77.3 ...
## $ read4 : num 77.8 82.3 71.4 60 59.1 ...
## $ lunch : num 40.6 27.1 41.8 12.8 17.1 ...
## $ enroll : int 468 679 400 251 439 561 442 381 274 326 ...
## $ expend : num 2747475 1505772 2121871 1211034 1913501 ...
## $ exppp : num 5871 2218 5305 4825 4359 ...
## $ lenroll: num 6.15 6.52 5.99 5.53 6.08 ...
## $ lexpend: num 14.8 14.2 14.6 14 14.5 ...
## $ lexppp : num 8.68 7.7 8.58 8.48 8.38 ...
## - attr(*, "datalabel")= chr ""
## - attr(*, "time.stamp")= chr "22 Dec 2005 10:35"
## - attr(*, "formats")= chr "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
## - attr(*, "types")= int 102 105 102 102 102 105 102 102 102 102 ...
## - attr(*, "val.labels")= chr "" "" "" "" ...
## - attr(*, "var.labels")= chr "district code" "building code" "% students satisfactory, 4th grade math" "% students satisfactory, 4th grade reading" ...
## - attr(*, "version")= int 7
# 2. Number and share of schools whose math4 value is exactly 100.
n_perfect <- sum(meap01$math4 == 100)
n_perfect # 38 observations have math4 equal to 100
## [1] 38
# Share of the total sample. This is a proportion (about 2.08% once
# multiplied by 100). nrow() replaces the retyped literals 38 and 1823.
n_perfect / nrow(meap01)
## [1] 0.02084476
# 3. Don't understand the question
# 4. Compare the average pass rates for math and reading scores
summary(meap01$math4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 61.60 76.40 71.91 87.00 100.00
summary(meap01$read4)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 48.90 62.70 60.06 73.90 100.00
# 5. Correlation between math4 and read4
# (the second, identical cor() call on the same columns was removed)
math4 <- meap01$math4
read4 <- meap01$read4
cor(math4, read4) # math4 and read4 are highly correlated
## [1] 0.8427281
# 6. Average and standard deviation of "exppp"
summary(meap01$exppp) # Mean
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1207 4502 5078 5195 5767 11960
sd(meap01$exppp) # Standard deviation
## [1] 1091.89
# 7. Don't understand the question
# C4 ----
# Section C4 loads the JTRAIN2.DTA data set.
# (Written as a comment: a bare `C4` is evaluated as a symbol and stops the
# script with "object 'C4' not found".)
jtrain2 <- read.dta("JTRAIN2.DTA")
## Warning in read.dta("JTRAIN2.DTA"): cannot read factor labels from Stata 5
## files
# C5 ----
# Section C5 works with the FERTIL2.DTA data set.
# (Written as a comment: a bare `C5` is evaluated as a symbol and stops the
# script with "object 'C5' not found".)
fertil2 <- read.dta("FERTIL2.DTA")
## Warning in read.dta("FERTIL2.DTA"): cannot read factor labels from Stata 5
## files
# 1. Distribution of the `children` column
summary(fertil2$children)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 0.000 2.000 2.268 4.000 13.000
# 2. Number and share of women with electricity (electric == 1)
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
# (A stray bare `library()` call was removed here: without arguments it only
# lists the installed packages and loads nothing.)
# na.rm = TRUE drops rows where `electric` is missing from the count.
n_electric <- sum(fertil2$electric == 1, na.rm = TRUE)
n_electric # 611 observations have electricity
## [1] 611
# Percentage of the whole sample. Rows with a missing `electric` stay in the
# denominator, exactly as in the original 611/4361; nrow() replaces the
# retyped literals.
100 * n_electric / nrow(fertil2) # 14.01%
## [1] 14.01055
# 3. Not done yet
# Women with at least one child who have electricity.
sum(fertil2$children > 0 & fertil2$electric == 1, na.rm = TRUE)
## [1] 450
# Women without children.
sum(fertil2$children == 0)
## [1] 1132
# Women with at least one child.
sum(fertil2$children > 0)
## [1] 3229
# Women with at least one child who have no electricity.
# Fixed the misspelt column name `chileren`: `$` returned NULL for it, the
# comparison was therefore empty, and the sum was always 0 whatever the data.
sum(fertil2$children > 0 & fertil2$electric == 0, na.rm = TRUE)
## (the recorded output "[1] 0" came from the typo; re-run to record the count)