# Chapter 1 ----

# C1 ----

# Session setup for the Chapter 1 computer exercises, then load WAGE1.
# NOTE(review): the absolute path below exists on one machine only. It is kept
# because the later read.dta() calls in this file use bare file names and so
# rely on this working directory; prefer a project-relative path when all the
# reads can be changed together.
setwd("/Users/vancam/Documents/WAIKATO-Thesis/Rworking/Wooldridge/StataFile")
# Deliberately no rm(list = ls()) here: it would wipe the caller's workspace
# whenever this file is sourced, and this section only uses objects it creates.
library(foreign) # provides read.dta() for Stata files
wage1 <- read.dta("WAGE1.DTA")
## Warning in read.dta("WAGE1.DTA"): cannot read factor labels from Stata 5
## files
str(wage1) # column names, types and first values
## 'data.frame':    526 obs. of  24 variables:
##  $ wage    : num  3.1 3.24 3 6 5.3 ...
##  $ educ    : int  11 12 11 8 12 16 18 12 12 17 ...
##  $ exper   : int  2 22 2 44 7 9 15 5 26 22 ...
##  $ tenure  : int  0 2 0 28 2 8 7 3 4 21 ...
##  $ nonwhite: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ female  : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ married : int  0 1 0 1 1 1 0 0 0 1 ...
##  $ numdep  : int  2 3 2 0 1 0 0 0 2 0 ...
##  $ smsa    : int  1 1 0 1 0 1 1 1 1 1 ...
##  $ northcen: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ south   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ west    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ construc: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ndurman : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trcommpu: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ trade   : int  0 0 1 0 0 0 1 0 1 0 ...
##  $ services: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ profserv: int  0 0 0 0 0 1 0 0 0 0 ...
##  $ profocc : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ clerocc : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ servocc : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ lwage   : num  1.13 1.18 1.1 1.79 1.67 ...
##  $ expersq : int  4 484 4 1936 49 81 225 25 676 484 ...
##  $ tenursq : int  0 4 0 784 4 64 49 9 16 441 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "16 Sep 1996 15:52"
##  - attr(*, "formats")= chr  "%8.2g" "%8.0g" "%8.0g" "%8.0g" ...
##  - attr(*, "types")= int  102 98 98 98 98 98 98 98 98 98 ...
##  - attr(*, "val.labels")= chr  "" "" "" "" ...
##  - attr(*, "var.labels")= chr  "average hourly earnings" "years of education" "years potential experience" "years with current employer" ...
##  - attr(*, "version")= int 5
summary(wage1) # Min, quartiles, median, mean and max of every variable
##       wage             educ           exper           tenure      
##  Min.   : 0.530   Min.   : 0.00   Min.   : 1.00   Min.   : 0.000  
##  1st Qu.: 3.330   1st Qu.:12.00   1st Qu.: 5.00   1st Qu.: 0.000  
##  Median : 4.650   Median :12.00   Median :13.50   Median : 2.000  
##  Mean   : 5.896   Mean   :12.56   Mean   :17.02   Mean   : 5.105  
##  3rd Qu.: 6.880   3rd Qu.:14.00   3rd Qu.:26.00   3rd Qu.: 7.000  
##  Max.   :24.980   Max.   :18.00   Max.   :51.00   Max.   :44.000  
##     nonwhite          female          married           numdep     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.000  
##  Median :0.0000   Median :0.0000   Median :1.0000   Median :1.000  
##  Mean   :0.1027   Mean   :0.4791   Mean   :0.6084   Mean   :1.044  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:2.000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :6.000  
##       smsa           northcen         south             west       
##  Min.   :0.0000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :1.0000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.7224   Mean   :0.251   Mean   :0.3555   Mean   :0.1692  
##  3rd Qu.:1.0000   3rd Qu.:0.750   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000  
##     construc          ndurman          trcommpu           trade       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.00000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.00000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.0000   Median :0.00000   Median :0.0000  
##  Mean   :0.04563   Mean   :0.1141   Mean   :0.04373   Mean   :0.2871  
##  3rd Qu.:0.00000   3rd Qu.:0.0000   3rd Qu.:0.00000   3rd Qu.:1.0000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.00000   Max.   :1.0000  
##     services         profserv         profocc          clerocc      
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.0000   Median :0.0000   Median :0.0000   Median :0.0000  
##  Mean   :0.1008   Mean   :0.2586   Mean   :0.3669   Mean   :0.1673  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.0000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     servocc           lwage            expersq          tenursq       
##  Min.   :0.0000   Min.   :-0.6349   Min.   :   1.0   Min.   :   0.00  
##  1st Qu.:0.0000   1st Qu.: 1.2030   1st Qu.:  25.0   1st Qu.:   0.00  
##  Median :0.0000   Median : 1.5369   Median : 182.5   Median :   4.00  
##  Mean   :0.1407   Mean   : 1.6233   Mean   : 473.4   Mean   :  78.15  
##  3rd Qu.:0.0000   3rd Qu.: 1.9286   3rd Qu.: 676.0   3rd Qu.:  49.00  
##  Max.   :1.0000   Max.   : 3.2181   Max.   :2601.0   Max.   :1936.00
# Sample split by the 0/1 indicator `female` (min 0, max 1, no NAs in the
# summary above). Both counts come from the data, so they stay correct if the
# sample changes.
sum(wage1$female == 1) # Number of women in the sample
## [1] 252
sum(wage1$female == 0) # Number of men in the sample
## [1] 274

# C2 ----

# Load BWGHT; read.dta() comes from the foreign package attached earlier.
bwght <- read.dta("BWGHT.DTA")
## Warning in read.dta("BWGHT.DTA"): cannot read factor labels from Stata 5
## files
str(bwght) # Inspect the structure of the data (columns, types, first values)
## 'data.frame':    1388 obs. of  14 variables:
##  $ faminc  : num  13.5 7.5 0.5 15.5 27.5 7.5 65 27.5 27.5 37.5 ...
##  $ cigtax  : num  16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 16.5 ...
##  $ cigprice: num  122 122 122 122 122 ...
##  $ bwght   : int  109 133 129 126 134 118 140 86 121 129 ...
##  $ fatheduc: int  12 6 NA 12 14 12 16 12 12 16 ...
##  $ motheduc: int  12 12 12 12 12 14 14 14 17 18 ...
##  $ parity  : int  1 2 2 2 2 6 2 2 2 2 ...
##  $ male    : int  1 1 0 1 1 1 0 0 0 0 ...
##  $ white   : int  1 0 0 0 1 0 1 0 1 1 ...
##  $ cigs    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ lbwght  : num  4.69 4.89 4.86 4.84 4.9 ...
##  $ bwghtlbs: num  6.81 8.31 8.06 7.88 8.38 ...
##  $ packs   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ lfaminc : num  2.603 2.015 -0.693 2.741 3.314 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr " 3 Jun 1997 13:47"
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%8.0g" ...
##  - attr(*, "types")= int  102 102 102 105 98 98 98 98 98 98 ...
##  - attr(*, "val.labels")= chr  "" "" "" "" ...
##  - attr(*, "var.labels")= chr  "1988 family income, $1000s" "cig. tax in home state, 1988" "cig. price in home state, 1988" "birth weight, ounces" ...
##  - attr(*, "version")= int 5
# Print the first six rows of bwght.
head(x = bwght, n = 6L)
##   faminc cigtax cigprice bwght fatheduc motheduc parity male white cigs
## 1   13.5   16.5    122.3   109       12       12      1    1     1    0
## 2    7.5   16.5    122.3   133        6       12      2    1     0    0
## 3    0.5   16.5    122.3   129       NA       12      2    0     0    0
## 4   15.5   16.5    122.3   126       12       12      2    1     0    0
## 5   27.5   16.5    122.3   134       14       12      2    1     1    0
## 6    7.5   16.5    122.3   118       12       14      6    1     0    0
##     lbwght bwghtlbs packs    lfaminc
## 1 4.691348   6.8125     0  2.6026897
## 2 4.890349   8.3125     0  2.0149031
## 3 4.859812   8.0625     0 -0.6931472
## 4 4.836282   7.8750     0  2.7408400
## 5 4.897840   8.3750     0  3.3141861
## 6 4.770685   7.3750     0  2.0149031
# Counts by the 0/1 indicator `male`.
# NOTE(review): presumably `male` is the sex of the child and `cigs` the
# mother's smoking; if so, every row is one woman and the exercise is answered
# by nrow(bwght) and sum(bwght$cigs > 0) -- confirm against the variable
# labels before reporting these numbers.
sum(bwght$male == 1) # Observations with male == 1
## [1] 723
sum(bwght$male == 0) # Observations with male == 0
## [1] 665
# Same count as the line above, derived from the data instead of typing in
# 1388 - 723.
nrow(bwght) - sum(bwght$male == 1)
## [1] 665
sum(bwght$male == 0 & bwght$cigs > 0) # male == 0 and a positive cigs value
## [1] 112
summary(bwght) # Min, quartiles, median, mean, max and NA counts per variable
##      faminc          cigtax         cigprice         bwght      
##  Min.   : 0.50   Min.   : 2.00   Min.   :103.8   Min.   : 23.0  
##  1st Qu.:14.50   1st Qu.:15.00   1st Qu.:122.8   1st Qu.:107.0  
##  Median :27.50   Median :20.00   Median :130.8   Median :120.0  
##  Mean   :29.03   Mean   :19.55   Mean   :130.6   Mean   :118.7  
##  3rd Qu.:37.50   3rd Qu.:26.00   3rd Qu.:137.0   3rd Qu.:132.0  
##  Max.   :65.00   Max.   :38.00   Max.   :152.5   Max.   :271.0  
##                                                                 
##     fatheduc        motheduc         parity           male       
##  Min.   : 1.00   Min.   : 2.00   Min.   :1.000   Min.   :0.0000  
##  1st Qu.:12.00   1st Qu.:12.00   1st Qu.:1.000   1st Qu.:0.0000  
##  Median :12.00   Median :12.00   Median :1.000   Median :1.0000  
##  Mean   :13.19   Mean   :12.94   Mean   :1.633   Mean   :0.5209  
##  3rd Qu.:16.00   3rd Qu.:14.00   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :18.00   Max.   :18.00   Max.   :6.000   Max.   :1.0000  
##  NA's   :196     NA's   :1                                       
##      white             cigs            lbwght         bwghtlbs     
##  Min.   :0.0000   Min.   : 0.000   Min.   :3.135   Min.   : 1.438  
##  1st Qu.:1.0000   1st Qu.: 0.000   1st Qu.:4.673   1st Qu.: 6.688  
##  Median :1.0000   Median : 0.000   Median :4.787   Median : 7.500  
##  Mean   :0.7846   Mean   : 2.087   Mean   :4.760   Mean   : 7.419  
##  3rd Qu.:1.0000   3rd Qu.: 0.000   3rd Qu.:4.883   3rd Qu.: 8.250  
##  Max.   :1.0000   Max.   :50.000   Max.   :5.602   Max.   :16.938  
##                                                                    
##      packs           lfaminc       
##  Min.   :0.0000   Min.   :-0.6931  
##  1st Qu.:0.0000   1st Qu.: 2.6741  
##  Median :0.0000   Median : 3.3142  
##  Mean   :0.1044   Mean   : 3.0713  
##  3rd Qu.:0.0000   3rd Qu.: 3.6243  
##  Max.   :2.5000   Max.   : 4.1744  
## 
# No variable describing the women's pregnancy has been identified in the data.
# Which variable is it?
# NOTE(review): `cigs` is presumably the one -- confirm against the labels.
# Report average family income and its standard deviation.
# faminc is measured in $1000s (see its variable label): the mean is 29.03
# (from summary(bwght)) and the standard deviation is about 18.74, i.e.
# roughly $18,740.
sd(bwght$faminc)
## [1] 18.73928

# C3 ----

# Load MEAP01 and look at the range of math4.
meap01 <- read.dta("MEAP01.DTA")
summary(meap01$math4)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   61.60   76.40   71.91   87.00  100.00
# 1. The largest value of math4 is 100 and the smallest is 0.
str(meap01)
## 'data.frame':    1823 obs. of  11 variables:
##  $ dcode  : num  1010 2070 2080 3010 3010 3010 3020 3020 3020 3030 ...
##  $ bcode  : int  4937 597 4860 790 1403 4056 922 2864 4851 881 ...
##  $ math4  : num  83.3 90.3 61.9 85.7 77.3 ...
##  $ read4  : num  77.8 82.3 71.4 60 59.1 ...
##  $ lunch  : num  40.6 27.1 41.8 12.8 17.1 ...
##  $ enroll : int  468 679 400 251 439 561 442 381 274 326 ...
##  $ expend : num  2747475 1505772 2121871 1211034 1913501 ...
##  $ exppp  : num  5871 2218 5305 4825 4359 ...
##  $ lenroll: num  6.15 6.52 5.99 5.53 6.08 ...
##  $ lexpend: num  14.8 14.2 14.6 14 14.5 ...
##  $ lexppp : num  8.68 7.7 8.58 8.48 8.38 ...
##  - attr(*, "datalabel")= chr ""
##  - attr(*, "time.stamp")= chr "22 Dec 2005 10:35"
##  - attr(*, "formats")= chr  "%9.0g" "%9.0g" "%9.0g" "%9.0g" ...
##  - attr(*, "types")= int  102 105 102 102 102 105 102 102 102 102 ...
##  - attr(*, "val.labels")= chr  "" "" "" "" ...
##  - attr(*, "var.labels")= chr  "district code" "building code" "% students satisfactory, 4th grade math" "% students satisfactory, 4th grade reading" ...
##  - attr(*, "version")= int 7
# 2. Observations with a perfect math pass rate (math4 equal to 100).
sum(meap01$math4 == 100) # 38 observations have math4 equal to 100
## [1] 38
# Share of the whole sample, computed from the data rather than typed in as
# 38/1823. The value is a proportion, not a percentage: about 2.08% of the
# observations.
mean(meap01$math4 == 100)
## [1] 0.02084476
# 3. Don't understand the question.
# 4. Compare the average pass rates for math and reading: the mean is 71.91
#    for math4 and 60.06 for read4.
with(meap01, summary(math4))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   61.60   76.40   71.91   87.00  100.00
with(meap01, summary(read4))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   48.90   62.70   60.06   73.90  100.00
# 5. Correlation between math4 and read4.
# The columns are copied into standalone vectors first; the two assignments
# are kept in case code outside this section relies on them.
math4 <- meap01$math4
read4 <- meap01$read4
# Computed once: math4 and read4 are highly correlated (about 0.84).
cor(math4, read4)
## [1] 0.8427281
# 6. Average and standard deviation of exppp.
with(meap01, summary(exppp)) # The "Mean" column is the average
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1207    4502    5078    5195    5767   11960
with(meap01, sd(exppp)) # Standard deviation
## [1] 1091.89
# 7. Don't understand the question.

# C4 ----

# Load JTRAIN2; no analysis of it follows in this section.
jtrain2 <- read.dta("JTRAIN2.DTA")
## Warning in read.dta("JTRAIN2.DTA"): cannot read factor labels from Stata 5
## files

# C5 ----

# Load FERTIL2.
fertil2 <- read.dta("FERTIL2.DTA")
## Warning in read.dta("FERTIL2.DTA"): cannot read factor labels from Stata 5
## files
# 1. Distribution of `children`: minimum 0, maximum 13, mean 2.268.
summary(fertil2$children)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   0.000   2.000   2.268   4.000  13.000
# 2. Number and percentage of women with electric == 1.
library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
# na.rm = TRUE drops comparisons that are NA because electric is missing.
sum(fertil2$electric == 1, na.rm = TRUE) # 611 observations have electric == 1
## [1] 611
# Percentage of all rows, computed from the data rather than typed in as
# 611/4361. Any rows where electric is NA stay in the denominator, which keeps
# the original figure of 14.01%.
sum(fertil2$electric == 1, na.rm = TRUE) / nrow(fertil2) * 100
## [1] 14.01055
# 3. Not done yet.

# Cross-counts of having children and having electricity.
# Women with at least one child and electric == 1.
sum(fertil2$children > 0 & fertil2$electric == 1, na.rm = TRUE)
## [1] 450
sum(fertil2$children == 0) # Women with no children
## [1] 1132
sum(fertil2$children > 0) # Women with at least one child
## [1] 3229
# Women with at least one child and electric == 0. The column must be spelled
# `children`: `$` with a name that matches no column returns NULL, which makes
# the whole condition empty and the sum 0. Re-run to record the actual count.
sum(fertil2$children > 0 & fertil2$electric == 0, na.rm = TRUE)