#Chapter2 C9.

##Use the data in COUNTYMURDERS to answer this questions. Use only the data for 1996. ##(I) How many counties had zero murders in 1996? How many counties had at least one execution? What is the largest number of executions?

library(wooldridge)
data("countymurders")
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the 1996 rows; the parts of this exercise below work on this subset.
countymurders1996 <- filter(countymurders, year == 1996)
# Preview the first ten rows of the 1996 subset.
head(countymurders1996, n = 10)
##    arrests countyid   density  popul perc1019 perc2029 percblack percmale
## 1        8     1001  67.21535  40061 15.89077 13.17491 20.975510 48.70073
## 2        6     1003  77.05643 123023 13.93886 11.63929 13.496660 48.83233
## 3        1     1005  29.91548  26475 15.06327 13.69972 46.190750 49.15203
## 4        0     1009  67.20457  43392 14.17542 12.99318  1.415007 48.97446
## 5        1     1011  17.89899  11188 14.98927 14.13121 72.756520 49.91956
## 6        2     1013  27.71148  21530 15.68509 11.25871 41.384110 46.81839
## 7       20     1015 186.53970 113511 14.71135 14.28936 19.096830 47.99447
## 8        4     1017  61.51258  36748 14.65386 13.13813 37.253730 47.31142
## 9        2     1019  38.27024  21170 14.13321 12.13037  7.042985 49.22060
## 10       0     1021  50.89291  35323 14.80339 12.64332 11.921410 48.60006
##    rpcincmaint rpcpersinc rpcunemins year murders  murdrate arrestrate
## 1      192.038  11852.760     26.796 1996       7 1.7473350  1.9969550
## 2      139.084  13583.020     28.710 1996       6 0.4877137  0.4877137
## 3      405.768  10760.510     63.162 1996       1 0.3777148  0.3777148
## 4      184.382  11094.820     21.692 1996       2 0.4609145  0.0000000
## 5      485.518   8349.506     63.162 1996       0 0.0000000  0.8938148
## 6      357.918   9947.058     54.868 1996       2 0.9289364  0.9289364
## 7      248.820  11536.320     35.090 1996      14 1.2333610  1.7619440
## 8      243.078  10899.590     41.470 1996       3 0.8163710  1.0884950
## 9      200.970   9806.698     26.796 1996       0 0.0000000  0.9447331
## 10     231.594  10819.840     40.194 1996       0 0.0000000  0.0000000
##    statefips countyfips execs    lpopul execrate
## 1          1          1     0 10.598160        0
## 2          1          3     0 11.720130        0
## 3          1          5     0 10.183960        0
## 4          1          9     0 10.678030        0
## 5          1         11     0  9.322598        0
## 6          1         13     0  9.977202        0
## 7          1         15     0 11.639660        0
## 8          1         17     0 10.511840        0
## 9          1         19     0  9.960340        0
## 10         1         21     0 10.472290        0
# Number of 1996 counties whose murder count is exactly zero.
zero_murders_count <- with(countymurders1996, sum(murders == 0))
cat("Counties with zero murders in 1996:", zero_murders_count, "\n")
## Counties with zero murders in 1996: 1051
# Number of 1996 counties with one or more executions.
counties_with_execution <- with(countymurders1996, sum(execs >= 1))
cat("Counties with at least one execution in 1996:", counties_with_execution, "\n")
## Counties with at least one execution in 1996: 31
# Largest execution count observed in any 1996 county.
max_executions <- with(countymurders1996, max(execs))
cat("Largest number of executions in 1996:", max_executions, "\n")
## Largest number of executions in 1996: 3

###ANS: According to the 1996 data in COUNTYMURDERS, 1,051 counties reported zero murders. Additionally, 31 counties recorded at least one execution, with the highest number of executions being three.

##(II) Estimate the equation murders = B0 + B1*execs + u by OLS and report the results in the usual way, including sample size and R-squared.

# Simple OLS regression of murders on executions over the 1996 counties.
# The data frame is passed directly: subset() called without a condition
# returns its input unchanged, so the wrapper did nothing.
model <- lm(murders ~ execs, data = countymurders1996)
# Coefficient table, residual summary, R-squared and degrees of freedom.
summary(model)
## 
## Call:
## lm(formula = murders ~ execs, data = countymurders1996)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -149.12   -5.46   -4.46   -2.46 1338.99 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   5.4572     0.8348   6.537 7.79e-11 ***
## execs        58.5555     5.8333  10.038  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 38.89 on 2195 degrees of freedom
## Multiple R-squared:  0.04389,    Adjusted R-squared:  0.04346 
## F-statistic: 100.8 on 1 and 2195 DF,  p-value: < 2.2e-16

##(III)Interpret the slope coefficient reported in part (ii). Does the estimated equation suggest a deterrent effect of capital punishment?

# Interpret the slope actually estimated in part (ii) instead of printing a
# generic definition, and let its sign decide the deterrence statement.
slope_execs <- coef(model)[["execs"]]
cat("Each additional execution is associated with about", round(slope_execs, 2), "more murders in the county.\n")
## Each additional execution is associated with about 58.56 more murders in the county.
if (slope_execs < 0) {
  cat("The slope is negative, which is consistent with a deterrent effect of capital punishment.\n")
} else {
  cat("The slope is not negative, so the estimated equation does not suggest a deterrent effect of capital punishment.\n")
}
## The slope is not negative, so the estimated equation does not suggest a deterrent effect of capital punishment.

##(IV)What is the smallest number of murders that can be predicted by the equation? What is the residual for a county with zero executions and zero murders?

# The estimated slope is positive, so the fitted value is smallest at the
# lowest execution count. The minimum is taken from the 1996 subset the model
# was fitted on, not from the full multi-year data set.
min_executions <- min(countymurders1996$execs)
min_predicted_murders <- coef(model)[1] + coef(model)[2] * min_executions
cat("Smallest number of murders predicted:", min_predicted_murders, "\n")
## Smallest number of murders predicted: 5.457241
# A residual is actual minus fitted. The county has 0 murders, and the fitted
# value at execs = 0 is the intercept, so the residual is its negative.
zero_exec_zero_murders_residual <- 0 - predict(model, newdata = data.frame(execs = 0), type = "response")
cat("Residual for a county with zero executions and zero murders:", zero_exec_zero_murders_residual, "\n")
## Residual for a county with zero executions and zero murders: -5.457241

##(v)Explain why a simple regression analysis is not well suited for determining whether capital punishment has a deterrent effect on murders.

# Part (v) answer: four lines of text emitted by one cat() call.
cat(
  "A simple regression analysis may suffer from omitted variable bias and endogeneity issues.\n",
  "Factors other than executions could influence the murder rate, leading to biased estimates.\n",
  "Additionally, the decision to implement capital punishment may be influenced by the crime rate,\n",
  "creating endogeneity problems and making causal inference challenging.\n",
  sep = ""
)
## A simple regression analysis may suffer from omitted variable bias and endogeneity issues.
## Factors other than executions could influence the murder rate, leading to biased estimates.
## Additionally, the decision to implement capital punishment may be influenced by the crime rate,
## creating endogeneity problems and making causal inference challenging.

#Chapter3 5. ##In a study relating college grade point average to time spent in various activities, you distribute a survey to several students. The students are asked how many hours they spend each week in four activities: studying, sleeping, working, and leisure. Any activity is put into one of the four categories, so that for each student, the sum of hours in the four activities must be 168. ##(I) In the model GPA = B0 + B1*study + B2*sleep + B3*work + B4*leisure + u, does it make sense to hold sleep, work, and leisure fixed, while changing study?

library(wooldridge)
# Load the gpa1 data set and show its first ten rows.
# NOTE(review): the text answers printed for this exercise do not appear to
# reference gpa1 — confirm the preview is intentional.
data("gpa1")
head(gpa1,10)
##    age soph junior senior senior5 male campus business engineer colGPA hsGPA
## 1   21    0      0      1       0    0      0        1        0    3.0   3.0
## 2   21    0      0      1       0    0      0        1        0    3.4   3.2
## 3   20    0      1      0       0    0      0        1        0    3.0   3.6
## 4   19    1      0      0       0    1      1        1        0    3.5   3.5
## 5   20    0      1      0       0    0      0        1        0    3.6   3.9
## 6   20    0      0      1       0    1      1        1        0    3.0   3.4
## 7   22    0      0      0       1    0      0        1        0    2.7   3.5
## 8   22    0      0      0       1    0      0        0        0    2.7   3.0
## 9   22    0      0      0       1    0      0        0        0    2.7   3.0
## 10  19    1      0      0       0    0      0        1        0    3.8   4.0
##    ACT job19 job20 drive bike walk voluntr PC greek car siblings bgfriend clubs
## 1   21     0     1     1    0    0       0  0     0   1        1        0     0
## 2   24     0     1     1    0    0       0  0     0   1        0        1     1
## 3   26     1     0     0    0    1       0  0     0   1        1        0     1
## 4   27     1     0     0    0    1       0  0     0   0        1        0     0
## 5   28     0     1     0    1    0       0  0     0   1        1        1     0
## 6   25     0     0     0    0    1       0  0     0   1        1        0     0
## 7   25     0     0     0    1    0       0  0     1   1        1        0     1
## 8   22     1     0     1    0    0       0  1     0   0        1        1     0
## 9   21     1     0     1    0    0       0  0     0   1        1        1     1
## 10  27     1     0     0    0    1       0  1     0   0        1        0     1
##    skipped alcohol gradMI fathcoll mothcoll
## 1      2.0    1.00      1        0        0
## 2      0.0    1.00      1        1        1
## 3      0.0    1.00      1        1        1
## 4      0.0    0.00      0        0        0
## 5      0.0    1.50      1        1        0
## 6      0.0    0.00      0        1        0
## 7      0.0    2.00      1        0        1
## 8      3.0    3.00      1        1        1
## 9      2.0    2.50      1        1        1
## 10     0.5    0.75      1        0        1
# Part (I): the four activities always total 168 hours, so study cannot change
# while the other three stay fixed.
cat(
  "In the given model, it does not make sense to hold sleep, work, and leisure fixed while changing study.\n",
  "The reason is that the sum of hours in all four activities must be 168 for each student.\n",
  "Changing the hours spent on studying would inherently change the hours available for other activities.\n\n",
  sep = ""
)
## In the given model, it does not make sense to hold sleep, work, and leisure fixed while changing study.
## The reason is that the sum of hours in all four activities must be 168 for each student.
## Changing the hours spent on studying would inherently change the hours available for other activities.

##(II) Explain why this model violates Assumption MLR.3.

# MLR.3 is the no-perfect-collinearity assumption. The adding-up constraint is
# an exact linear relationship among the regressors, which is what it rules out.
cat(
  "This model violates Assumption MLR.3, which rules out perfect collinearity among the regressors.\n",
  "Because study + sleep + work + leisure = 168 for every student, each regressor is an exact linear\n",
  "function of the other three, so the OLS estimates cannot be computed.\n\n",
  sep = ""
)
## This model violates Assumption MLR.3, which rules out perfect collinearity among the regressors.
## Because study + sleep + work + leisure = 168 for every student, each regressor is an exact linear
## function of the other three, so the OLS estimates cannot be computed.

##(III) How could you reformulate the model so that its parameters have a useful interpretation and it satisfies Assumption MLR.3?

# Part (III): drop one activity so the remaining regressors are no longer
# tied together by the 168-hour constraint.
cat(
  "To satisfy Assumption MLR.3, you could reformulate the model by using a set of independent variables that\n",
  "are not constrained to sum to a fixed value. For example, you could use the hours spent on three activities\n",
  "as independent variables, and the fourth one can be derived from the constraint (168 - study - work - leisure).\n",
  sep = ""
)
## To satisfy Assumption MLR.3, you could reformulate the model by using a set of independent variables that
## are not constrained to sum to a fixed value. For example, you could use the hours spent on three activities
## as independent variables, and the fourth one can be derived from the constraint (168 - study - work - leisure).

#Chapter3 10 ##Suppose that you are interested in estimating the ceteris paribus relationship between y and x1. For this purpose, you can collect data on two control variables, x2 and x3. (For concreteness, you might think of y as final exam score, x1 as class attendance, x2 as GPA up through the previous semester, and x3 as SAT or ACT score.) Let B~1 be the simple regression estimate from y on x1 and let B^1 be the multiple regression estimate from y on x1, x2, x3.

library(wooldridge)
# Load the mroz data set and show its first ten rows.
# NOTE(review): the text answers printed for this exercise do not appear to
# reference mroz — confirm the preview is intentional.
data("mroz")
head(mroz,10)
##    inlf hours kidslt6 kidsge6 age educ   wage repwage hushrs husage huseduc
## 1     1  1610       1       0  32   12 3.3540    2.65   2708     34      12
## 2     1  1656       0       2  30   12 1.3889    2.65   2310     30       9
## 3     1  1980       1       3  35   12 4.5455    4.04   3072     40      12
## 4     1   456       0       3  34   12 1.0965    3.25   1920     53      10
## 5     1  1568       1       2  31   14 4.5918    3.60   2000     32      12
## 6     1  2032       0       0  54   12 4.7421    4.70   1040     57      11
## 7     1  1440       0       2  37   16 8.3333    5.95   2670     37      12
## 8     1  1020       0       0  54   12 7.8431    9.98   4120     53       8
## 9     1  1458       0       2  48   12 2.1262    0.00   1995     52       4
## 10    1  1600       0       2  39   12 4.6875    4.15   2100     43      12
##    huswage faminc    mtr motheduc fatheduc unem city exper  nwifeinc      lwage
## 1   4.0288  16310 0.7215       12        7  5.0    0    14 10.910060 1.21015370
## 2   8.4416  21800 0.6615        7        7 11.0    1     5 19.499981 0.32851210
## 3   3.5807  21040 0.6915       12        7  5.0    0    15 12.039910 1.51413774
## 4   3.5417   7300 0.7815        7        7  5.0    0     6  6.799996 0.09212332
## 5  10.0000  27300 0.6215       12       14  9.5    1     7 20.100058 1.52427220
## 6   6.7106  19495 0.6915       14        7  7.5    1    33  9.859054 1.55648005
## 7   3.4277  21152 0.6915       14        7  5.0    0    11  9.152048 2.12025952
## 8   2.5485  18900 0.6915        3        3  5.0    0    35 10.900038 2.05963421
## 9   4.2206  20405 0.7515        7        7  3.0    0    24 17.305000 0.75433636
## 10  5.7143  20425 0.6915        7        7  5.0    0    21 12.925000 1.54489934
##    expersq
## 1      196
## 2       25
## 3      225
## 4       36
## 5       49
## 6     1089
## 7      121
## 8     1225
## 9      576
## 10     441

##(I) If x1 is highly correlated with x2 and x3 in the sample, and x2 and x3 have large partial effects on y, would you expect B~1 and B^1 to be similar or very different? Explain.

# Part (I): the simple regression leaves out x2 and x3. When they are strongly
# correlated with x1 and matter a lot for y, the two slope estimates on x1
# should differ substantially, not be similar.
cat(
  "If x1 is highly correlated with x2 and x3 and x2 and x3 have large partial effects on y,\n",
  "you would expect (B1 with ~ sign) and (adjusted B1) to be very different. The simple regression omits\n",
  "x2 and x3, so its slope on x1 also picks up their large effects on y (omitted variable bias).\n\n",
  sep = ""
)
## If x1 is highly correlated with x2 and x3 and x2 and x3 have large partial effects on y,
## you would expect (B1 with ~ sign) and (adjusted B1) to be very different. The simple regression omits
## x2 and x3, so its slope on x1 also picks up their large effects on y (omitted variable bias).

##(II) If x1 is almost uncorrelated with x2 and x3, but x2 and x3 are highly correlated, will B~1 and B^1 tend to be similar or very different? Explain.

cat(
  "If x1 is almost uncorrelated with x2 and x3 but x2 and x3 are highly correlated,\n",
  "(B1 with ~ sign) and (adjusted B1) tend to be similar. The high correlation between x2 and x3\n",
  "may result in multicollinearity issues, leading to unstable coefficient estimates for x2 and x3.\n\n",
  sep = ""
)
## If x1 is almost uncorrelated with x2 and x3 but x2 and x3 are highly correlated,
## (B1 with ~ sign) and (adjusted B1) tend to be similar. The high correlation between x2 and x3
## may result in multicollinearity issues, leading to unstable coefficient estimates for x2 and x3.

##(III) If x1 is highly correlated with x2 and x3, and x2 and x3 have small partial effects on y, would you expect se(B~1) or se(B^1) to be smaller? Explain.

cat(
  "If x1 is highly correlated with x2 and x3, and x2 and x3 have small partial effects on y,\n",
  "you would expect se(B₁ with ~ sign) to be smaller. The high correlation can lead to\n",
  "multicollinearity, inflating standard errors for the individual coefficients.\n\n",
  sep = ""
)
## If x1 is highly correlated with x2 and x3, and x2 and x3 have small partial effects on y,
## you would expect se(B₁ with ~ sign) to be smaller. The high correlation can lead to
## multicollinearity, inflating standard errors for the individual coefficients.

##(IV) If x1 is almost uncorrelated with x2 and x3, x2 and x3 have large partial effects on y, and x2 and x3 are highly correlated, would you expect se(B~1) or se(B^1) to be smaller? Explain.

cat(
  "If x1 is almost uncorrelated with x2 and x3, x2 and x3 have large partial effects on y,\n",
  "and x2 and x3 are highly correlated, you would expect se(adjusted B₁) to be smaller.\n",
  "The inclusion of highly correlated variables (x2 and x3) without much correlation with x1\n",
  "can improve the precision of the estimates for x1, resulting in smaller standard errors.\n",
  sep = ""
)
## If x1 is almost uncorrelated with x2 and x3, x2 and x3 have large partial effects on y,
## and x2 and x3 are highly correlated, you would expect se(adjusted B₁) to be smaller.
## The inclusion of highly correlated variables (x2 and x3) without much correlation with x1
## can improve the precision of the estimates for x1, resulting in smaller standard errors.

#Chapter 3 C8 ##Use the data in DISCRIM to answer this question. These are ZIP code- level data on prices for various items at fast-food restaurants, along with characteristics of the zip code population, in New Jersey and Pennsylvania. The idea is to see whether fast-food restaurants charge higher prices in areas with a larger concentration of blacks. ##(I) Find the average values of prpblck and income in the sample, along with their standard devia-tions. What are the units of measurement of prpblck and income?

library(wooldridge)

data("discrim")

# Sample means and standard deviations of prpblck and income.
# prpblck contains missing values (its unadjusted mean is NA), so every
# statistic is computed with na.rm = TRUE.
mean_prpblck <- mean(discrim$prpblck, na.rm = TRUE)
sd_prpblck <- sd(discrim$prpblck, na.rm = TRUE)
mean_income <- mean(discrim$income, na.rm = TRUE)
sd_income <- sd(discrim$income, na.rm = TRUE)
cat("Average prpblck:", mean_prpblck, "\n")
## Average prpblck: 0.1134864
# Summary table built from the values above: prpblck is formatted without
# scientific notation, and the income statistics are truncated to integers.
data.frame(
  "Mean" = c(
    format(mean_prpblck, scientific = FALSE),
    as.integer(mean_income)
  ),
  "SD" = c(
    format(sd_prpblck, scientific = FALSE),
    as.integer(sd_income)
  ),
  row.names = c("prpblck", "income")
)
##              Mean        SD
## prpblck 0.1134864 0.1824165
## income      47053     13179

##ANS:The averages for “prpblck” and “income” are 0.113 and 47,053, respectively, with standard deviations of 0.1824 and 13,179.29. It’s clear that “prpblck” represents the proportion of the Black population, while “income” is measured in dollars. ##(II) Consider a model to explain the price of soda, psoda, in terms of the proportion of the population that is black and median income: psoda = b0 + b1prpblck + b2income + u. Estimate this model by OLS and report the results in equation form, including the sample size and R-squared. (Do not use scientific notation when reporting the estimates.) Interpret the coefficient on prpblck. Do you think it is economically large?

# OLS regression of the soda price (psoda) on prpblck and income, fitted on
# the discrim data; lm() drops rows with missing values by default.
aapl <- lm(psoda~prpblck+income, data = discrim)
# Print the coefficient table, R-squared and residual degrees of freedom.
summary(aapl)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06

##ANS: The resulting regression is psoda.hat = 0.956 + 0.115 prpblck + 0.0000016 income. The sample size is n = 401 (398 residual degrees of freedom plus 3 estimated parameters; 9 observations were dropped because of missing values), R^2 is 0.0642, and the adjusted R^2 is 0.0595. The coefficient on prpblck indicates that, all else being equal, if prpblck increases by 0.10 (10 percentage points) the price of soda will increase by approximately 1.2 cents, which is not economically significant.

##(III)Compare the estimate from part (ii) with the simple regression estimate from psoda on prpblck. Is the discrimination effect larger or smaller when you control for income?

# Simple regression of psoda on prpblck alone, for comparison with the model
# in part (ii) that also includes income.
v <- lm(psoda~prpblck, data = discrim)
# Auto-print the fitted call and coefficients.
v
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Coefficients:
## (Intercept)      prpblck  
##     1.03740      0.06493
# Full summary: standard errors, t statistics, R-squared.
summary(v)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015

##ANS: The estimated coefficient on prpblck in the simple regression is 0.065, which is lower than the estimate of 0.115 from part (ii). The discrimination effect is therefore larger when income is controlled for, and it diminishes when income is left out of the model.

##(IV) A model with a constant price elasticity with respect to income may be more appropriate. Report estimates of the model log(psoda) = b0 + b1*prpblck + b2*log(income) + u. If prpblck increases by .20 (20 percentage points), what is the estimated percentage change in psoda? (Hint: The answer is 2.xx, where you fill in the “xx.”)

# Regression of log(psoda) on prpblck and log(income); the log(income)
# coefficient is the price elasticity with respect to income.
lpsoda <- lm(log(psoda)~prpblck+log(income), data = discrim)
# Auto-print the fitted call and coefficients.
lpsoda
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Coefficients:
## (Intercept)      prpblck  log(income)  
##    -0.79377      0.12158      0.07651
# Full summary: standard errors, t statistics, R-squared.
summary(lpsoda)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
# Percentage change in psoda for a 0.20 increase in prpblck. With log(psoda)
# as the dependent variable this is approximately 100 * b1 * 0.20. The
# coefficient is read from the fitted model rather than typed in as a
# hand-rounded 0.122, which pushed the reported figure up to 2.44.
paste(round(100 * 0.20 * coef(lpsoda)[["prpblck"]], 2), "percent increase")
## [1] "2.43 percent increase"

##ANS: If "prpblck" increases by 0.20 (20 percentage points), the estimated psoda will rise by approximately 2.43%.

##(V)Now add the variable prppov to the regression in part (iv). What happens to b^ prpblck?

# Same specification as part (iv) with prppov added as a further regressor.
sq <- lm(log(psoda)~prpblck+log(income)+prppov, data = discrim)
# Auto-print the fitted call and coefficients.
sq
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Coefficients:
## (Intercept)      prpblck  log(income)       prppov  
##    -1.46333      0.07281      0.13696      0.38036

##ANS: Including “prppov” in the model reduces the coefficient of “prpblck” from 0.1216 to 0.0728.

##(VI)Find the correlation between log(income) and prppov. Is it roughly what you expected?

# Pearson correlation between log(income) and prppov, using only the rows
# where both values are present.
with(discrim, cor(log(income), prppov, use = "complete.obs", method = "pearson"))
## [1] -0.838467

##ANS:The correlation is approximately -0.838, which makes sense, as lower income levels would be expected to correspond with higher poverty rates.

##(VII)Evaluate the following statement: “Because log(income) and prppov are so highly correlated, they have no business being in the same regression.” ##ANS: While they are highly correlated, including both variables does not lead to perfect collinearity. Instead, it enhances the model by adding another control variable that helps to isolate the effect of discrimination.

#Chapter 4 3. ##The variable rdintens is expenditures on research and development (R&D) as a percentage of sales. Sales are measured in millions of dollars. The variable profmarg is profits as a percentage of sales. Using the data in RDCHEM for 32 firms in the chemical industry, the following equation is estimated: rdintens = .472 + .321 log(sales) + .050 profmarg, with standard errors (1.369), (.216), and (.046), respectively; n = 32, R2 = .099. ##(I) Interpret the coefficient on log(sales). In particular, if sales increases by 10%, what is the estimated percentage point change in rdintens? Is this an economically large effect?

# Estimates and standard errors copied from the reported equation:
#   rdintens = .472 + .321 log(sales) + .050 profmarg,  n = 32
# with standard errors (1.369), (.216), (.046).
coeff_log_sales <- 0.321
se_log_sales <- 0.216
coeff_profmarg <- 0.050
se_profmarg <- 0.046

# Level-log model: a 1% rise in sales changes rdintens by about beta / 100
# percentage points, so a 10% rise changes it by (beta / 100) * 10.
percentage_change <- (coeff_log_sales / 100) * 10
cat("I) Estimated percentage point change in Rdintens for a 10% increase in sales:", percentage_change, "\n")
## I) Estimated percentage point change in Rdintens for a 10% increase in sales: 0.0321

##(II)Test the hypothesis that R&D intensity does not change with sales against the alternative that it does increase with sales. Do the test at the 5% and 10% levels.

t_stat_log_sales <- coeff_log_sales / se_log_sales
# The alternative is one-sided (the coefficient is positive), so the p-value
# is the upper-tail probability; df = n - k - 1 = 32 - 2 - 1 = 29.
p_value_log_sales <- pt(t_stat_log_sales, df = 29, lower.tail = FALSE)
cat("II) p-value for the test on log(sales) coefficient:", round(p_value_log_sales, 4), "\n")
## II) p-value for the test on log(sales) coefficient: 0.074
cat("   (At 5% level):", if (p_value_log_sales < 0.05) "Reject H0" else "Fail to reject H0", "\n")
##    (At 5% level): Fail to reject H0
cat("   (At 10% level):", if (p_value_log_sales < 0.10) "Reject H0" else "Fail to reject H0", "\n")
##    (At 10% level): Reject H0

##(III)Interpret the coefficient on profmarg. Is it economically large?

cat("III) Coefficient on profmarg:", coeff_profmarg, "\n")
## III) Coefficient on profmarg: 0.05

##ANS:Coefficient on profmarg is not economically large. ##(IV)Does profmarg have a statistically significant effect on rdintens ?

t_stat_profmarg <- coeff_profmarg / se_profmarg
# Significance of profmarg is a two-sided question, so both tails are counted.
p_value_profmarg <- 2 * (1 - pt(abs(t_stat_profmarg), df = 29))  # two-tailed test
cat("d) p-value for the test on profmarg coefficient:", p_value_profmarg, "\n")
## d) p-value for the test on profmarg coefficient: 0.2860082
cat("   (At 5% level):", if (p_value_profmarg < 0.05) "Reject H0" else "Fail to reject H0", "\n")
##    (At 5% level): Fail to reject H0
cat("   (At 10% level):", if (p_value_profmarg < 0.10) "Reject H0" else "Fail to reject H0", "\n")
##    (At 10% level): Fail to reject H0

##ANS: The p-value for the test on the profmarg coefficient is 0.2860082. At the 5% significance level, we fail to reject the null hypothesis (H0), indicating that there is not enough evidence to conclude that the profmarg coefficient is significantly different from zero. Similarly, at the 10% significance level, we also fail to reject H0. This suggests that the profmarg variable may not have a statistically significant impact on the dependent variable within the model.

#Chapter4 C8. ##The data set 401KSUBS contains information on net financial wealth (nettfa), age of the survey respondent (age), annual family income (inc), family size (fsize), and participation in certain pension plans for people in the United States. The wealth and income variables are both recorded in thousands of dollars. For this question, use only the data for single-person households (so fsize = 1). ##(I) How many single-person households are there in the data set?

library(wooldridge)

# Load the dataset
data("k401ksubs")

# (i) Single-person households are the rows where fsize equals 1; count them.
single_person_households <- k401ksubs[which(k401ksubs$fsize == 1), , drop = FALSE]
num_single_person_households <- dim(single_person_households)[1]
cat("Number of single-person households:", num_single_person_households, "\n\n")
## Number of single-person households: 2017

##(II) Use OLS to estimate the model nettfa = B0 + B1*inc + B2*age + u, and report the results using the usual format. Be sure to use only the single-person households in the sample. Interpret the slope coefficients. Are there any surprises in the slope estimates?

# OLS regression of net financial wealth on income and age, restricted to the
# single-person households. This reassigns `model`, replacing the
# murders-on-executions fit stored under the same name earlier in the file.
model <- lm(nettfa ~ inc + age, data = single_person_households)
# Print the coefficient table, R-squared and residual degrees of freedom.
summary(model)
## 
## Call:
## lm(formula = nettfa ~ inc + age, data = single_person_households)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
# Part (ii) commentary: four lines of text emitted by one cat() call.
cat(
  "Interpretation of slope coefficients:\n",
  "B1 (inc): The estimated change in nettfa for a one-unit change in inc (annual family income).\n",
  "B2 (age): The estimated change in nettfa for a one-unit change in age.\n",
  "There might be surprises depending on the context and expectations of the relationship between variables.\n\n",
  sep = ""
)
## Interpretation of slope coefficients:
## B1 (inc): The estimated change in nettfa for a one-unit change in inc (annual family income).
## B2 (age): The estimated change in nettfa for a one-unit change in age.
## There might be surprises depending on the context and expectations of the relationship between variables.

##(III)Does the intercept from the regression in part (ii) have an interesting meaning? Explain.

# Part (iii) commentary on the intercept, emitted by one cat() call.
cat(
  "The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.\n",
  "In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.\n\n",
  sep = ""
)
## The intercept (B0) represents the estimated net financial wealth (nettfa) when both inc and age are zero.
## In this context, it may not have a meaningful interpretation, as having zero income and age is not practically meaningful.

##(IV) Find the p-value for the test H0: B2 = 1 against H1: B2 < 1. Do you reject H0 at the 1% significance level?

# Test H0: B2 = 1 against H1: B2 < 1 for the age coefficient.
# The p-value reported by summary() is for the two-sided test of B2 = 0, so it
# cannot be reused here. The t statistic is centred on 1 and, because the
# alternative is "less than", the p-value is the lower-tail probability.
age_row <- coef(summary(model))["age", ]
t_stat_age <- (age_row[["Estimate"]] - 1) / age_row[["Std. Error"]]
test_result <- pt(t_stat_age, df = df.residual(model))
cat("p-value for the test H0: B₂ = 1 against H₁: B₂ < 1:", round(test_result, 3), "\n")
## p-value for the test H0: B₂ = 1 against H₁: B₂ < 1: 0.044
cat("At the 1% significance level, we would reject H0 if the p-value is less than 0.01.\n")
## At the 1% significance level, we would reject H0 if the p-value is less than 0.01.
if (test_result < 0.01) {
  cat("We reject H0; there is evidence that B₂ is less than 1.\n\n")
} else {
  cat("We do not reject H0; there is not enough evidence to conclude that B₂ is less than 1.\n\n")
}
## We do not reject H0; there is not enough evidence to conclude that B₂ is less than 1.

##(V) If you do a simple regression of nettfa on inc, is the estimated coefficient on inc much different from the estimate in part (ii)? Why or why not?

# Simple regression of nettfa on inc alone (age omitted), on the same
# single-person sample, for comparison with the part (ii) estimate.
simple_model <- lm(nettfa ~ inc, data = single_person_households)
# Print the coefficient table, R-squared and residual degrees of freedom.
summary(simple_model)
## 
## Call:
## lm(formula = nettfa ~ inc, data = single_person_households)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -185.12  -12.85   -4.85    1.78 1112.66 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.5709     2.0607   -5.13 3.18e-07 ***
## inc           0.8207     0.0609   13.48  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.59 on 2015 degrees of freedom
## Multiple R-squared:  0.08267,    Adjusted R-squared:  0.08222 
## F-statistic: 181.6 on 1 and 2015 DF,  p-value: < 2.2e-16
# Part (v) commentary: five lines of text emitted by one cat() call.
cat(
  "Comparison of the estimated coefficient on inc:\n",
  "The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).\n",
  "Differences may arise due to the inclusion of age in the multiple regression model, which may affect\n",
  "the relationship between nettfa and inc. The context and goals of the analysis will determine\n",
  "whether the inclusion of age improves the model.\n",
  sep = ""
)
## Comparison of the estimated coefficient on inc:
## The estimated coefficient on inc in the simple regression is compared to the estimate in part (ii).
## Differences may arise due to the inclusion of age in the multiple regression model, which may affect
## the relationship between nettfa and inc. The context and goals of the analysis will determine
## whether the inclusion of age improves the model.

#Chapter 5, Problem 5 ##The following histogram was created using the variable score in the data file ECONMATH. Thirty bins were used to create the histogram, and the height of each cell is the proportion of observations falling within the corresponding interval. The best-fitting normal distribution (that is, the one using the sample mean and sample standard deviation) has been superimposed on the histogram. ##(I) If you use the normal distribution to estimate the probability that score exceeds 100, would the answer be zero? Why does your answer contradict the assumption of a normal distribution for score?

library(wooldridge)
# Load the dataset
data('econmath')
# Load necessary libraries
library(ggplot2)
library(stats)

# Pull the `score` column out of `econmath` (loaded above with data('econmath')).
# The object keeps the name `data` because later lines refer to it; calls such
# as data("...") still work, since R skips non-function objects when it looks
# up a function name.
data <- econmath$score

# Compute the 30-break histogram without drawing it. The result is stored but
# not used again in this part of the script.
hist_data <- hist(data, breaks = 30, plot = FALSE)

# Fit a normal distribution using the sample mean and standard deviation.
mu <- mean(data)
sigma <- sd(data)

# Evaluate the fitted density on 100 evenly spaced points covering the observed
# range of scores. `length.out` is written in full instead of relying on
# partial matching of `length`.
x <- seq(min(data), max(data), length.out = 100)
y <- dnorm(x, mean = mu, sd = sigma)

# Plot the histogram and the fitted normal distribution
hist_plot <- ggplot() +
  geom_histogram(aes(x = data, y = ..density..), bins = 30, fill = "red", color = "black") +
  geom_line(aes(x = x, y = y), color = "green", size = 1) +
  labs(title = "Histogram and Fitted Normal Distribution",
       x = "Score",
       y = "Density") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(hist_plot)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Part (i): upper-tail probability P(score > 100) under the fitted normal.
# lower.tail = FALSE asks pnorm() for the upper tail directly instead of
# forming 1 - pnorm(...), which loses precision when the tail is small.
prob_exceed_100 <- pnorm(100, mean = mu, sd = sigma, lower.tail = FALSE)
cat("i. Probability that 'score' exceeds 100 using the normal distribution:", prob_exceed_100, "\n")
## i. Probability that 'score' exceeds 100 using the normal distribution: 0.02044288

# Normal Q-Q plot of the scores with its reference line.
qqnorm(data)
qqline(data)

# Shapiro-Wilk test of normality for the scores.
shapiro.test(data)
## 
##  Shapiro-Wilk normality test
## 
## data:  data
## W = 0.96973, p-value = 2.454e-12

##(II) Explain what is happening in the left tail of the histogram. Does the normal distribution fit well in the left tail?

# Part (ii): print the written answer about the fit in the left tail.
cat(
  "The normal distribution may not fit well in the left tail, as percentile values are bounded by values of 0 and 100, and normal distribution assumes unbounded tails.\n"
)
## The normal distribution may not fit well in the left tail, as percentile values are bounded by values of 0 and 100, and normal distribution assumes unbounded tails.

#Chapter 5 C1 ##Use the data in WAGE1 for this exercise. ## (i) Estimate the equation: $wage = \beta_0 + \beta_1 educ + \beta_2 exper + \beta_3 tenure + u$.

library(wooldridge)
library(ggplot2)
data("wage1")
# Part (i): level-level model, wage regressed on educ, exper and tenure.
wage_data <- wage1
model_level <- lm(wage ~ educ + exper + tenure, data = wage_data)
residuals_level <- residuals(model_level)

# Part (ii) repeats part (i) and plots a histogram of its residuals, so the
# matching histogram is drawn here for the level-level residuals; it uses the
# same colours so that the two pictures can be compared.
hist(residuals_level, main = "Histogram of Residuals (Level-Level Model)", col = "green", border = "black")

summary(model_level)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16

##(II)Repeat part (i), but with log(wage) as the dependent variable.

# Part (ii): same regressors as the level-level model, with log(wage) as the
# response.
model_log_level <- lm(
  formula = log(wage) ~ educ + exper + tenure,
  data = wage_data
)
residuals_log_level <- resid(model_log_level)

# Histogram of the log-level residuals.
hist(
  residuals_log_level,
  col = "green",
  border = "black",
  main = "Histogram of Residuals (Log-Level Model)"
)

summary(model_log_level)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16

##(III)Would you say that Assumption MLR.6 is closer to being satisfied for the level-level model or the log-level model?

# Draw the two normal Q-Q plots side by side. Only two panels are drawn, so a
# 1x2 layout is used. par() returns the settings it replaces; they are saved
# and restored afterwards instead of being reset to a hard-coded layout.
old_par <- par(mfrow = c(1, 2))

qqnorm(residuals_level, main = "Q-Q Plot - Level-Level Model")
qqline(residuals_level)

qqnorm(residuals_log_level, main = "Q-Q Plot - Log-Level Model")
qqline(residuals_log_level)

par(old_par)  # put the previous graphics layout back

# Print both coefficient tables again under labelled headings so the two fits
# can be read one after the other. These are the same summaries already shown
# for model_level in part (i) and model_log_level in part (ii).
cat("\nSummary Statistics - Level-Level Model:\n")
## 
## Summary Statistics - Level-Level Model:
summary(model_level)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
# Same table for the model with log(wage) as the response.
cat("\nSummary Statistics - Log-Level Model:\n")
## 
## Summary Statistics - Log-Level Model:
summary(model_log_level)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure, data = wage_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16

##ANS: The residuals from the log-level model appear to be closer to normally distributed; in other words, the log-level model is closer to satisfying Assumption MLR.6.