# Exercise 1

# Load the wooldridge library
library(wooldridge)

# Load the BWGHT dataset
data("bwght")

# C2 ----
# (i) How many women are in the sample, and how many report smoking during
#     pregnancy?
# The script treats every row of bwght as one woman, and counts a woman as a
# smoker when her cigs value is strictly positive.
smoking_women <- with(bwght, sum(cigs > 0))
total_women <- NROW(bwght)
cat(
  "(i) Total women in the sample:",
  total_women,
  "\n"
)

## (i) Total women in the sample: 1388

cat(
  "    Women who report smoking during pregnancy:",
  smoking_women,
  "\n"
)

##     Women who report smoking during pregnancy: 212

# (ii) What is the average number of cigarettes smoked per day?
# Mean over the whole sample, so women with cigs == 0 are included.
average_cigarettes <- mean(bwght$cigs)
cat("\n(ii) Average number of cigarettes smoked per day:", average_cigarettes, "\n")

##
## (ii) Average number of cigarettes smoked per day: 2.087176

# (ii) Is the average a good measure of the "typical" woman in this case? Explain.
# The number of smokers is computed from the data instead of being typed into
# the sentence, and the message ends with "\n" so that the next printed line
# starts on its own line rather than being glued to the end of this one.
cat(
  "It would not be a good measure to use the average of only one variable,",
  "where only", sum(bwght$cigs > 0),
  "observations have a value greater than zero to define",
  "'typical' women.\n"
)

## It would not be a good measure to use the average of only one variable, where only 212 observations have a value greater than zero to define 'typical' women.

# (iii) Among women who smoked during pregnancy, what is the average number of
#       cigarettes smoked per day?
# Keep only the rows for women who smoked during pregnancy (cigs > 0).
# NOTE: this rebinds `smoking_women`, which held the *count* of smokers in
# part (i); from here on it is the data frame of smokers.
smoking_women <- bwght[bwght$cigs > 0, ]

# Average number of cigarettes per day within the smokers-only subset.
# mean() replaces the hand-rolled sum(...) / nrow(...).
average_cigs_smoked <- mean(smoking_women$cigs)

cat("Average number of cigarettes smoked per day among smoking women:", average_cigs_smoked, "\n")

## Average number of cigarettes smoked per day among smoking women: 13.66509

# (iii) Compare this with the answer from part (ii) and explain why they might differ.
# The part (ii) mean averages over every woman, including those with
# cigs == 0; this one conditions on cigs > 0, hence the larger value.
# The message is newline-terminated like the other cat() calls.
cat(
  "Excluding women who didn't smoke during pregnancy from the calculation of",
  "average number of cigarettes smoked per day has resulted in a much more",
  "realistic value.\n"
)

## Excluding women who didn't smoke during pregnancy from the calculation of average number of cigarettes smoked per day has resulted in a much more realistic value.

# (iv) Find the average of fatheduc in the sample.

# fatheduc has missing values; na.rm = TRUE drops them before averaging,
# otherwise mean() would return NA.
average_fatheduc <- mean(bwght$fatheduc, na.rm = TRUE)

cat("\n(iv) Average of fatheduc in the sample (omitting NAs):", average_fatheduc, "\n")

##
## (iv) Average of fatheduc in the sample (omitting NAs): 13.18624

# (iv) Why are only 1,192 observations used to compute this average?
# Count the missing entries from the data rather than hard-coding the number
# in the sentence, and end the message with a newline.
missing_fatheduc <- sum(is.na(bwght$fatheduc))
cat(
  "There are", missing_fatheduc,
  "observations that do not have any specific value (represented as NA) in",
  "the variable fatheduc, which reduces the sample size.\n"
)

## There are 196 observations that do not have any specific value (represented as NA) in the variable fatheduc, which reduces the sample size.

# (v) Report the average family income and its standard deviation in dollars.
# In the wooldridge bwght data, faminc is recorded in thousands of dollars,
# so both statistics are scaled by 1000 to report them in dollars as the
# question asks. (Mean and standard deviation both scale linearly.)
average_income <- 1000 * mean(bwght$faminc)
std_dev_income <- 1000 * sd(bwght$faminc)
cat("\n(v) Average family income in dollars:", average_income, "\n")

##
## (v) Average family income in dollars: 29026.66

cat("    Standard deviation of family income in dollars:", std_dev_income, "\n")

##     Standard deviation of family income in dollars: 18739.28

#C3
# Load the wooldridge library
library(wooldridge)

# Load the MEAP01 dataset
data("meap01")

# (i) Find the largest and smallest values of math4. Does the range make
#     sense? Explain.
# The recorded output below shows math4 running from 0 to 100, which is what
# one expects if it is a pass rate expressed in percent.
min_math4 <- with(meap01, min(math4))
max_math4 <- with(meap01, max(math4))
cat(
  "(i) Largest value of math4:",
  max_math4,
  "\n"
)

## (i) Largest value of math4: 100

cat(
  "    Smallest value of math4:",
  min_math4,
  "\n"
)

##     Smallest value of math4: 0

cat(
  "    Range of math4 (largest - smallest):",
  max_math4 - min_math4,
  "\n"
)

##     Range of math4 (largest - smallest): 100

# (ii) How many schools have a perfect pass rate on the math test? What
#      percentage is this of the total sample?
# A perfect pass rate means math4 equals 100; the share is taken relative to
# the number of rows in meap01.
total_schools <- NROW(meap01)
perfect_pass_rate <- with(meap01, sum(math4 == 100))
percentage_perfect_pass <- (perfect_pass_rate / total_schools) * 100
cat(
  "\n(ii) Number of schools with a perfect pass rate on math test:",
  perfect_pass_rate,
  "\n"
)

##
## (ii) Number of schools with a perfect pass rate on math test: 38

cat(
  "     Percentage of total sample with perfect pass rate:",
  percentage_perfect_pass,
  "%\n"
)

##      Percentage of total sample with perfect pass rate: 2.084476 %

# (iii) How many schools have math pass rates of exactly 50%?
# Counts the rows whose math4 value equals 50.
schools_50_percent <- with(meap01, sum(math4 == 50))
cat(
  "\n(iii) Number of schools with math pass rates of exactly 50%:",
  schools_50_percent,
  "\n"
)

##
## (iii) Number of schools with math pass rates of exactly 50%: 17

# (iv) Compare the average pass rates for the math and reading scores. Which
#      test is harder to pass?
# Per the recorded output below, reading has the lower mean pass rate
# (60.06 vs 71.91), so by this measure it is the harder test to pass.
average_reading_pass_rate <- with(meap01, mean(read4))
average_math_pass_rate <- with(meap01, mean(math4))
cat(
  "\n(iv) Average pass rate for math test:",
  average_math_pass_rate,
  "\n"
)

##
## (iv) Average pass rate for math test: 71.909

cat(
  "     Average pass rate for reading test:",
  average_reading_pass_rate,
  "\n"
)

##      Average pass rate for reading test: 60.06188

# (v) Find the correlation between math4 and read4. What do you conclude?
# cor() with its default method gives the Pearson correlation. The recorded
# value of about 0.84 is a strong positive linear association between the two
# pass rates.
correlation_math_read <- with(meap01, cor(math4, read4))
cat(
  "\n(v) Correlation between math4 and read4:",
  correlation_math_read,
  "\n"
)

##
## (v) Correlation between math4 and read4: 0.8427281

cat(
  "    The correlation indicates the strength and direction of the linear relationship between math and reading scores.\n"
)

##     The correlation indicates the strength and direction of the linear relationship between math and reading scores.

# (vi) The variable exppp is expenditure per pupil. Find the average of exppp
#      along with its standard deviation. Would you say there is wide
#      variation in per pupil spending?
# Per the recorded output below, the standard deviation (1091.89) is roughly
# 21% of the mean (5194.865).
std_dev_exppp <- with(meap01, sd(exppp))
average_exppp <- with(meap01, mean(exppp))
cat(
  "\n(vi) Average expenditure per pupil:",
  average_exppp,
  "\n"
)

##
## (vi) Average expenditure per pupil: 5194.865

cat(
  "     Standard deviation of expenditure per pupil:",
  std_dev_exppp,
  "\n"
)

##      Standard deviation of expenditure per pupil: 1091.89

cat(
  "     The standard deviation indicates the degree of variation in per pupil spending. A larger standard deviation suggests wider variation.\n"
)

##      The standard deviation indicates the degree of variation in per pupil spending. A larger standard deviation suggests wider variation.

# (vii) Suppose School A spends $6,000 per student and School B spends $5,500
#       per student. Calculate the percentage difference.
# Two versions are reported: the exact percentage by which A exceeds B
# (relative to B), and the approximation 100 * (log(A) - log(B)).
school_B_spending <- 5500
school_A_spending <- 6000

# Exact: (A - B) / B, expressed in percent.
percentage_difference <-
  ((school_A_spending - school_B_spending) / school_B_spending) * 100

# Approximation: difference of natural logs, expressed in percent.
# diff() over c(log(B), log(A)) yields log(A) - log(B).
approx_percentage_difference <-
  100 * diff(log(c(school_B_spending, school_A_spending)))

cat(
  "\n(vii) Percentage difference between School A and School B spending:",
  percentage_difference,
  "%\n"
)

##
## (vii) Percentage difference between School A and School B spending: 9.090909 %

cat(
  "     Approximate percentage difference based on natural logs:",
  approx_percentage_difference,
  "%\n"
)

##      Approximate percentage difference based on natural logs: 8.701138 %

# Exercise 1

# Misheel 112035128

# 2024-09-22