library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(dslabs)
library(ISLR2)
library(matlib)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.
library(wooldridge)
cat("\n====== Chapter 2 ======\n\n")
## 
## ====== Chapter 2 ======
cat("\n====== C9 ======\n\n")
## 
## ====== C9 ======
data("countymurders")
data1 <- countymurders
attach(data1)
## The following object is masked from package:dslabs:
## 
##     murders
data1 <- data1 %>% filter(year == 1996)
head(data1)
##   arrests countyid  density  popul perc1019 perc2029 percblack percmale
## 1       8     1001 67.21535  40061 15.89077 13.17491 20.975510 48.70073
## 2       6     1003 77.05643 123023 13.93886 11.63929 13.496660 48.83233
## 3       1     1005 29.91548  26475 15.06327 13.69972 46.190750 49.15203
## 4       0     1009 67.20457  43392 14.17542 12.99318  1.415007 48.97446
## 5       1     1011 17.89899  11188 14.98927 14.13121 72.756520 49.91956
## 6       2     1013 27.71148  21530 15.68509 11.25871 41.384110 46.81839
##   rpcincmaint rpcpersinc rpcunemins year murders  murdrate arrestrate statefips
## 1     192.038  11852.760     26.796 1996       7 1.7473350  1.9969550         1
## 2     139.084  13583.020     28.710 1996       6 0.4877137  0.4877137         1
## 3     405.768  10760.510     63.162 1996       1 0.3777148  0.3777148         1
## 4     184.382  11094.820     21.692 1996       2 0.4609145  0.0000000         1
## 5     485.518   8349.506     63.162 1996       0 0.0000000  0.8938148         1
## 6     357.918   9947.058     54.868 1996       2 0.9289364  0.9289364         1
##   countyfips execs    lpopul execrate
## 1          1     0 10.598160        0
## 2          3     0 11.720130        0
## 3          5     0 10.183960        0
## 4          9     0 10.678030        0
## 5         11     0  9.322598        0
## 6         13     0  9.977202        0
nrow(data1[data1$murders==0,])
## [1] 1051
nrow(data1[data1$execs>=1,])
## [1] 31
max(execs)  # note: 'execs' comes from attach(data1) above, which ran before the 1996 filter, so this maximum is over all years
## [1] 7
print("there are 1051 counties having no murders, 31 counties have at least one execution, and the maximum number for execution is 7.")
## [1] "there are 1051 counties having no murders, 31 counties have at least one execution, and the maximum number for execution is 7."
model_c9 <- lm(murders ~ execs)  # uses the attached data1 (all years), since attach() ran before the 1996 filter
summary(model_c9)
## 
## Call:
## lm(formula = murders ~ execs)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -202.23   -6.84   -5.84   -3.84 1937.16 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   6.8382     0.2418   28.28   <2e-16 ***
## execs        65.4650     2.1463   30.50   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.64 on 37347 degrees of freedom
## Multiple R-squared:  0.02431,    Adjusted R-squared:  0.02428 
## F-statistic: 930.4 on 1 and 37347 DF,  p-value: < 2.2e-16
plot(model_c9)

print(" murders = 6.8382 + 65.4650*execs N = 2197, R-squared = 0.0243.")
## [1] " murders = 6.8382 + 65.4650*execs N = 2197, R-squared = 0.0243."
print("Coefficient slope B1 = 65.4650, interpret that one unit increasing in execution will lead to 65.465 unit increasing in number of murders.")
## [1] "Coefficient slope B1 = 65.4650, interpret that one unit increasing in execution will lead to 65.465 unit increasing in number of murders."
print("Based on the fomular above, the smallest number of murders is equal to the constrant B0 (6.8382) when the value of execution = 0.")
## [1] "Based on the fomular above, the smallest number of murders is equal to the constrant B0 (6.8382) when the value of execution = 0."
print("A simple regression model is not well-suited for analyzing the deterrent effect of capital punishment based on county murder data in 1996 due to issues such as endogeneity, omitted variable bias, heterogeneity across counties, and the potential for nonlinear relationships. Advanced econometric techniques, accounting for these complexities, are more appropriate for a nuanced understanding of the relationship between capital punishment and murder rates.")
## [1] "A simple regression model is not well-suited for analyzing the deterrent effect of capital punishment based on county murder data in 1996 due to issues such as endogeneity, omitted variable bias, heterogeneity across counties, and the potential for nonlinear relationships. Advanced econometric techniques, accounting for these complexities, are more appropriate for a nuanced understanding of the relationship between capital punishment and murder rates."
cat("\n====== Chapter 3 ======\n\n")
## 
## ====== Chapter 3 ======
cat("\n====== C5======\n\n")
## 
## ====== C5======
print("i. No, because study = 168 - sleep - work - leisure.")
## [1] "i. No, because study = 168 - sleep - work - leisure."
print("ii.According to assumption MLR 3, in the sample (and therefore in the population), none of the independent variables is constant and there are no exact relationships among the independent variables. Since the sum of all independent variables is 168, there is an exect linear relationship between them. This violates the assumption MLR.3")
## [1] "ii.According to assumption MLR 3, in the sample (and therefore in the population), none of the independent variables is constant and there are no exact relationships among the independent variables. Since the sum of all independent variables is 168, there is an exect linear relationship between them. This violates the assumption MLR.3"
print("iii. Since student age is under 18 year-old, they are almost not working yet, therefore, we can eliminate the working variable out of the formula since its effect is logically insignificant.To reformulate the model so its parameters have a useful interpretation and satisfy MLR.3, we can drop 1 variable. For GPA, study, sleep, leisure can be more important. Hence, we drop work. The formula will be:

GPA= B0+ B1* study + B2* sleep + B3 * leisure+u")
## [1] "iii. Since student age is under 18 year-old, they are almost not working yet, therefore, we can eliminate the working variable out of the formula since its effect is logically insignificant.To reformulate the model so its parameters have a useful interpretation and satisfy MLR.3, we can drop 1 variable. For GPA, study, sleep, leisure can be more important. Hence, we drop work. The formula will be:\n\nGPA= B0+ B1* study + B2* sleep + B3 * leisure+u"
cat("\n====== C10 ======\n\n")
## 
## ====== C10 ======
cat("*y = a0 + a1x1 + e1*\n*y = b0 + b1x1 + b2x2 + b3x3 + e2*")
## *y = a0 + a1x1 + e1*
## *y = b0 + b1x1 + b2x2 + b3x3 + e2*
print("i. the a1 = b1 + cov[(x1,x2); (x1; x3)]. There for, if x1 is highly related to x2 and x3, we will have the value of a1 is much greater than b1. If x1 is highly corelated with x2 and x3, and x2 and x3 have large partial effects on y, the regression coefficient of simple regression will be biased and smaller than the coefficient of multiple regression. This is because simple regression did not account for effects of x2 and x3, leading to omitted variable bias. By adding x2 and x3, the model can better isolate the ceteris paribus relationship between y and x1, providing a more accurate estimate of the true effect of x1 on y. Therefore, the coefficient for x1 in the multiple regression is likely to be larger than the coefficient in the simple regression due to the reduction in omitted variable bias.")
## [1] "i. the a1 = b1 + cov[(x1,x2); (x1; x3)]. There for, if x1 is highly related to x2 and x3, we will have the value of a1 is much greater than b1. If x1 is highly corelated with x2 and x3, and x2 and x3 have large partial effects on y, the regression coefficient of simple regression will be biased and smaller than the coefficient of multiple regression. This is because simple regression did not account for effects of x2 and x3, leading to omitted variable bias. By adding x2 and x3, the model can better isolate the ceteris paribus relationship between y and x1, providing a more accurate estimate of the true effect of x1 on y. Therefore, the coefficient for x1 in the multiple regression is likely to be larger than the coefficient in the simple regression due to the reduction in omitted variable bias."
print("ii. If x1 is uncorrelated with x2, x3 => the covariance between (x1,x2) and between (x1,x3) is insignificant, then a1 and b1 are the same.Because the relationship among x1,x2 and x3 cannot be treated separatedly. Since, the highly correlated between x2 and x3 still cause bias. Consquently, the result will be different.")
## [1] "ii. If x1 is uncorrelated with x2, x3 => the covariance between (x1,x2) and between (x1,x3) is insignificant, then a1 and b1 are the same.Because the relationship among x1,x2 and x3 cannot be treated separatedly. Since, the highly correlated between x2 and x3 still cause bias. Consquently, the result will be different."
print("iii. b1 will be smaller. If x1 is highly correlated with x2 and x3, but x2 and x3 have small partial effects on y, I expect se(B1) for simple regression will smaller than se(B1) for multiple regression. It is because the impacts of x2 and x3 are insignificant. Since, we can exclude them from the model with better results.

")
## [1] "iii. b1 will be smaller. If x1 is highly correlated with x2 and x3, but x2 and x3 have small partial effects on y, I expect se(B1) for simple regression will smaller than se(B1) for multiple regression. It is because the impacts of x2 and x3 are insignificant. Since, we can exclude them from the model with better results.\n\n"
print("iv. b1 will be smaller. If x1 is highly correlated with x2 and x3, and x2 and x3 have larg partial effects on y, I expect se(B1) for multiple regression will smaller than se(B1) for simple regression. It is because the impacts of x2 and x3 are significant; thus, excluding them from the model will cause bias.")
## [1] "iv. b1 will be smaller. If x1 is highly correlated with x2 and x3, and x2 and x3 have larg partial effects on y, I expect se(B1) for multiple regression will smaller than se(B1) for simple regression. It is because the impacts of x2 and x3 are significant; thus, excluding them from the model will cause bias."
cat("\n====== C8 ======\n\n")
## 
## ====== C8 ======
data2 <- wooldridge::discrim
head(data2)
##   psoda pfries pentree wagest nmgrs nregs hrsopen  emp psoda2 pfries2 pentree2
## 1  1.12   1.06    1.02   4.25     3     5    16.0 27.5   1.11    1.11     1.05
## 2  1.06   0.91    0.95   4.75     3     3    16.5 21.5   1.05    0.89     0.95
## 3  1.06   0.91    0.98   4.25     3     5    18.0 30.0   1.05    0.94     0.98
## 4  1.12   1.02    1.06   5.00     4     5    16.0 27.5   1.15    1.05     1.05
## 5  1.12     NA    0.49   5.00     3     3    16.0  5.0   1.04    1.01     0.58
## 6  1.06   0.95    1.01   4.25     4     4    15.0 17.5   1.05    0.94     1.00
##   wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density    crmrte state
## 1    5.05      5      5     15.0 27.0       1     3    4030 0.0528866     1
## 2    5.05      4      3     17.5 24.5       0     1    4030 0.0528866     1
## 3    5.05      4      5     17.5 25.0       0     1   11400 0.0360003     1
## 4    5.05      4      5     16.0   NA       0     3    8345 0.0484232     1
## 5    5.05      3      3     16.0 12.0       0     1     720 0.0615890     1
## 6    5.05      3      4     15.0 28.0       0     1    4424 0.0334823     1
##     prpblck    prppov   prpncar hseval nstores income county     lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300       3  44534     18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200       3  41164     12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600       3  50366     10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100       1  72287     10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000       2  44515     18 0.05826885
##       lpfries  lhseval  lincome ldensity NJ BK KFC RR
## 1  0.05826885 11.90699 10.70401 8.301521  1  0   0  1
## 2 -0.09431065 11.90699 10.70401 8.301521  1  1   0  0
## 3 -0.09431065 12.03884 10.62532 9.341369  1  1   0  0
## 4  0.01980261 12.05292 10.82707 9.029418  1  0   0  1
## 5          NA 12.42561 11.18840 6.579251  1  1   0  0
## 6 -0.05129331 11.90497 10.70358 8.394799  1  1   0  0
data("discrim")
data2 <- discrim
attach(discrim)
## The following object is masked from data1:
## 
##     density
mean(prpblck, na.rm = T)
## [1] 0.1134864
mean(income, na.rm = T)
## [1] 47053.78
sd(prpblck, na.rm = T)
## [1] 0.1824165
sd(income, na.rm = T)
## [1] 13179.29
print("i. The averages for “prpblck” and “income” are 0.113 and 47,053, respectively. The standard deviations are likewise, 0.1824 and 13,179.29, respectively. It is apparent that prpblck represents a proportion of the black population, while income is represented in dollar terms.")
## [1] "i. The averages for “prpblck” and “income” are 0.113 and 47,053, respectively. The standard deviations are likewise, 0.1824 and 13,179.29, respectively. It is apparent that prpblck represents a proportion of the black population, while income is represented in dollar terms."
model_c8 <- lm(psoda~prpblck+income, data = discrim)
summary(model_c8)
## 
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.29401 -0.05242  0.00333  0.04231  0.44322 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 9.563e-01  1.899e-02  50.354  < 2e-16 ***
## prpblck     1.150e-01  2.600e-02   4.423 1.26e-05 ***
## income      1.603e-06  3.618e-07   4.430 1.22e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08611 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06422,    Adjusted R-squared:  0.05952 
## F-statistic: 13.66 on 2 and 398 DF,  p-value: 1.835e-06
print("ii. The resulting regression is psoda.hat = (0.956) + (0.115)prpblck + (0.0000016). The optimal sample size is 399 observations (indicated by the 398 degrees of freedom and 9 missing observations) and the adjusted R^2 is 0.595. The coefficient on prpblck indicates that, all things being equal, if prpblck increases by 10% the price of soda will increase by approximately 1.2 cents, which is not economically significant.
")
## [1] "ii. The resulting regression is psoda.hat = (0.956) + (0.115)prpblck + (0.0000016). The optimal sample size is 399 observations (indicated by the 398 degrees of freedom and 9 missing observations) and the adjusted R^2 is 0.595. The coefficient on prpblck indicates that, all things being equal, if prpblck increases by 10% the price of soda will increase by approximately 1.2 cents, which is not economically significant.\n"
model_c8_1 <- lm(psoda~prpblck, data = discrim)
summary(model_c8_1)
## 
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.30884 -0.05963  0.01135  0.03206  0.44840 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.03740    0.00519  199.87  < 2e-16 ***
## prpblck      0.06493    0.02396    2.71  0.00702 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0881 on 399 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.01808,    Adjusted R-squared:  0.01561 
## F-statistic: 7.345 on 1 and 399 DF,  p-value: 0.007015
print("iii. The estimate of the coefficient on prpblack with the simple regression is 0.065. This is lower than the prior estimate, and therefore shows that the discrimination effect decreases when income is excluded.")
## [1] "iii. The estimate of the coefficient on prpblack with the simple regression is 0.065. This is lower than the prior estimate, and therefore shows that the discrimination effect decreases when income is excluded."
model_c8_2 <- lm(log(psoda)~prpblck+log(income), data = discrim)
summary(model_c8_2)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33563 -0.04695  0.00658  0.04334  0.35413 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.79377    0.17943  -4.424 1.25e-05 ***
## prpblck      0.12158    0.02575   4.722 3.24e-06 ***
## log(income)  0.07651    0.01660   4.610 5.43e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0821 on 398 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.06809,    Adjusted R-squared:  0.06341 
## F-statistic: 14.54 on 2 and 398 DF,  p-value: 8.039e-07
paste((0.2*100)*0.122, "percent increase")  # iv. from the log model above: a 0.20 increase in prpblck implies about a 0.2*0.122 = 2.44% higher psoda
## [1] "2.44 percent increase"
model_c8_3 <- lm(log(psoda)~prpblck+log(income)+prppov)
model_c8_3
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov)
## 
## Coefficients:
## (Intercept)      prpblck  log(income)       prppov  
##    -1.46333      0.07281      0.13696      0.38036
cor(log(discrim$income), discrim$prppov, method = "pearson", use = "complete.obs")
## [1] -0.838467
print("v. The correlation is approximately -0.838. This makes sense, because one would expect that declines in income would result in higher poverty rates.")
## [1] "v. The correlation is approximately -0.838. This makes sense, because one would expect that declines in income would result in higher poverty rates."
model <- lm(log(psoda)~prpblck+ log(income)+prppov, data=discrim)
summary(model)
## 
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.32218 -0.04648  0.00651  0.04272  0.35622 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.46333    0.29371  -4.982  9.4e-07 ***
## prpblck      0.07281    0.03068   2.373   0.0181 *  
## log(income)  0.13696    0.02676   5.119  4.8e-07 ***
## prppov       0.38036    0.13279   2.864   0.0044 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.08137 on 397 degrees of freedom
##   (9 observations deleted due to missingness)
## Multiple R-squared:  0.08696,    Adjusted R-squared:  0.08006 
## F-statistic:  12.6 on 3 and 397 DF,  p-value: 6.917e-08
print("vi. The correlation is not as my expectation. It tells that proportion of poverty and price of soda is only 0.0259, meaning no relation. I expected the poorer the population, the less price we obtain.")
## [1] "vi. The correlation is not as my expectation. It tells that proportion of poverty and price of soda is only 0.0259, meaning no relation. I expected the poorer the population, the less price we obtain."
data8_6 <- discrim %>% select(psoda,prppov) %>% na.omit 
cor(data8_6)
##             psoda     prppov
## psoda  1.00000000 0.02598077
## prppov 0.02598077 1.00000000
data8_7 <- discrim %>% select(income,prppov) %>% na.omit %>% mutate(income= log(income))
cor(data8_7)
##           income    prppov
## income  1.000000 -0.838467
## prppov -0.838467  1.000000
cat("\n====== Chapter 4 ======\n\n")
## 
## ====== Chapter 4 ======
cat("\n====== C3 ======\n\n")
## 
## ====== C3 ======
data("rdchem")
attach(rdchem)
paste(log(1.1)*0.321, "percentage-point change in rdintens")
## [1] "0.0305945677171883 percentage-point change in rdintens"
print("If log(sales) increases by 1 unit, rdintens increases by 0.321 units. If sales increases by 10%, rdintens increases by only about 0.321*log(1.1) = 0.031 percentage points, a small change.")
## [1] "If log(sales) increases by 1 unit, rdintens increases by 0.321 units. If sales increases by 10%, rdintens increases by only about 0.321*log(1.1) = 0.031 percentage points, a small change."
model_c3 <- lm(rdintens ~ log(sales) + profmarg)
summary(model_c3)
## 
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3016 -1.2707 -0.6895  0.8785  6.0369 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  0.47225    1.67606   0.282    0.780
## log(sales)   0.32135    0.21557   1.491    0.147
## profmarg     0.05004    0.04578   1.093    0.283
## 
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared:  0.09847,    Adjusted R-squared:  0.0363 
## F-statistic: 1.584 on 2 and 29 DF,  p-value: 0.2224
print("ii. The 5% critical value for a one-tailedtest, with df = 32 − 3 = 29, is obtained from Table G.2 as 1.699; so we cannot reject H0 at the 5% level. But the 10% critical value is 1.311; since the t statistic is abovethis value, we reject H0 in favor of H1 at the 10% level.Hypothesis testing
rdintens= B0+ B1* log(sales)+ B2 *profmarg

We have H0: B1= 0 and H1: B1 # 0

t-value = 0.321/0.216=1.486111

With n=32, df= 31:

Critical value for 5% level of significance is c= 2.042

Critical value for 10% level of significance is c= 1.69

Because 1.486 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in sales can increase RD intensity.")
## [1] "ii. The 5% critical value for a one-tailedtest, with df = 32 − 3 = 29, is obtained from Table G.2 as 1.699; so we cannot reject H0 at the 5% level. But the 10% critical value is 1.311; since the t statistic is abovethis value, we reject H0 in favor of H1 at the 10% level.Hypothesis testing\nrdintens= B0+ B1* log(sales)+ B2 *profmarg\n\nWe have H0: B1= 0 and H1: B1 # 0\n\nt-value = 0.321/0.216=1.486111\n\nWith n=32, df= 31:\n\nCritical value for 5% level of significance is c= 2.042\n\nCritical value for 10% level of significance is c= 1.69\n\nBecause 1.486 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in sales can increase RD intensity."
print("iii. One percentage unit change in profmarg interprets 0.05 percentage unit change in rdintens. Economically that is quite significant, as given a10 % increase in profit margin then they will increase expenditures on R& D by 0.5 percentage point. Coef
For profmarg, 1% increase in profit margin will lead to 0.05% increase in RD intensity. It cannot be considered as economically large because the average is 2.5%. However, with a big firm, when they already have large scale, the increase in RD intensity at 0.05% is acceptable.

Hypothesis Testing
rdintens= B0+ B1* log(sales)+ B2 *profmarg

We have H0: B2= 0 and H1: B2 # 0

t-value = 0.050/0.046=1.086857

With n=32, df= 31:

Critical value for 5% level of significance is c= 2.042

Critical value for 10% level of significance is c= 1.69

Because 1.086957 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in profit margin can increase RD intensity.")
## [1] "iii. One percentage unit change in profmarg interprets 0.05 percentage unit change in rdintens. Economically that is quite significant, as given a10 % increase in profit margin then they will increase expenditures on R& D by 0.5 percentage point. Coef\nFor profmarg, 1% increase in profit margin will lead to 0.05% increase in RD intensity. It cannot be considered as economically large because the average is 2.5%. However, with a big firm, when they already have large scale, the increase in RD intensity at 0.05% is acceptable.\n\nHypothesis Testing\nrdintens= B0+ B1* log(sales)+ B2 *profmarg\n\nWe have H0: B2= 0 and H1: B2 # 0\n\nt-value = 0.050/0.046=1.086857\n\nWith n=32, df= 31:\n\nCritical value for 5% level of significance is c= 2.042\n\nCritical value for 10% level of significance is c= 1.69\n\nBecause 1.086957 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in profit margin can increase RD intensity."
cat("\n====== C8 ======\n\n")
## 
## ====== C8 ======
data_c4 <- wooldridge::k401ksubs
head(data_c4,10)
##    e401k    inc marr male age fsize  nettfa p401k pira     incsq agesq
## 1      0 13.170    0    0  40     1   4.575     0    1  173.4489  1600
## 2      1 61.230    0    1  35     1 154.000     1    0 3749.1128  1225
## 3      0 12.858    1    0  44     2   0.000     0    0  165.3282  1936
## 4      0 98.880    1    1  44     2  21.800     0    0 9777.2539  1936
## 5      0 22.614    0    0  53     1  18.450     0    0  511.3930  2809
## 6      0 15.000    1    0  60     3   0.000     0    0  225.0000  3600
## 7      0 37.155    1    0  49     5   3.483     0    1 1380.4939  2401
## 8      0 31.896    1    0  38     5  -2.100     0    0 1017.3548  1444
## 9      0 47.295    1    0  52     2   5.290     0    1 2236.8169  2704
## 10     1 29.100    0    1  45     1  29.600     0    1  846.8100  2025
data("k401ksubs")
data4 <- k401ksubs %>% filter(fsize==1)
dim(data4)
## [1] 2017   11
attach(data4)
print("i. There are 2017 single-person households in the dataframe")
## [1] "i. There are 2017 single-person households in the dataframe"
model_c4 <- lm(nettfa ~ inc+ age)
summary(model_c4)
## 
## Call:
## lm(formula = nettfa ~ inc + age)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -179.95  -14.16   -3.42    6.03 1113.94 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -43.03981    4.08039 -10.548   <2e-16 ***
## inc           0.79932    0.05973  13.382   <2e-16 ***
## age           0.84266    0.09202   9.158   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared:  0.1193, Adjusted R-squared:  0.1185 
## F-statistic: 136.5 on 2 and 2014 DF,  p-value: < 2.2e-16
min(age)
## [1] 25
min(inc)
## [1] 10.008
print("ii. nettfa = -43.04 + 0.799inc + 0.842age (4.08) (.0597) (.0920) n = 2017, R^2 = .1185 One unit increasing in inc interprets .07993 unit increasing in nettfa One unit increasing in age interprets .84266 unit increasing in nettfa The model is significant, based on when age goes up, the income goes up and lead to the net financial wealth also go up.")
## [1] "ii. nettfa = -43.04 + 0.799inc + 0.842age (4.08) (.0597) (.0920) n = 2017, R^2 = .1185 One unit increasing in inc interprets .07993 unit increasing in nettfa One unit increasing in age interprets .84266 unit increasing in nettfa The model is significant, based on when age goes up, the income goes up and lead to the net financial wealth also go up."
income <- lm(inc ~ age)
income
## 
## Call:
## lm(formula = inc ~ age)
## 
## Coefficients:
## (Intercept)          age  
##    27.08278      0.06017
cov(log(inc), age)
## [1] 0.1907963
model <- lm(nettfa~inc, data= data_c4)
summary(model)
## 
## Call:
## lm(formula = nettfa ~ inc, data = data_c4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -504.39  -18.10   -4.29    6.73 1475.04 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -20.17948    1.17643  -17.15   <2e-16 ***
## inc           0.99991    0.02554   39.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.26 on 9273 degrees of freedom
## Multiple R-squared:  0.1418, Adjusted R-squared:  0.1417 
## F-statistic:  1532 on 1 and 9273 DF,  p-value: < 2.2e-16
cor(data_c4$age, data_c4$inc)
## [1] 0.1056378
cat("\n====== Chapter 5 ======\n\n")
## 
## ====== Chapter 5 ======
cat("\n====== C5 ======\n\n")
## 
## ====== C5 ======
data4 <- wooldridge::econmath
head(data4)
##   age work study econhs colgpa hsgpa acteng actmth act mathscr male calculus
## 1  23   15  10.0      0 3.4909 3.355     24     26  27      10    1        1
## 2  23    0  22.5      1 2.1000 3.219     23     20  24       9    1        0
## 3  21   25  12.0      0 3.0851 3.306     21     24  21       8    1        1
## 4  22   30  40.0      0 2.6805 3.977     31     28  31      10    0        1
## 5  22   25  15.0      1 3.7454 3.890     28     31  32       8    1        1
## 6  22    0  30.0      0 3.0555 3.500     25     30  28      10    1        1
##   attexc attgood fathcoll mothcoll score
## 1      0       0        1        1 84.43
## 2      0       0        0        1 57.38
## 3      1       0        0        1 66.39
## 4      0       1        1        1 81.15
## 5      0       1        0        1 95.90
## 6      1       0        0        1 83.61
print("i. If I use the normal distribution, probability that score exceeds 100 will not be zero because it should be symmetric. However, in real data, 100 is the highest score, hence my answer is contradict the assumption of normal distribution for score.")
## [1] "i. If I use the normal distribution, probability that score exceeds 100 will not be zero because it should be symmetric. However, in real data, 100 is the highest score, hence my answer is contradict the assumption of normal distribution for score."
print("ii.In the left tail of a histogram, the normal distribution might not fit well. The normal distribution assumes perfect symmetry and tails that extend infinitely, while real data might have asymmetry or limited tail extensions. Therefore, in the left tail, the normal distribution might not represent the actual data distribution accurately, especially if there’s skewness or outliers present")
## [1] "ii.In the left tail of a histogram, the normal distribution might not fit well. The normal distribution assumes perfect symmetry and tails that extend infinitely, while real data might have asymmetry or limited tail extensions. Therefore, in the left tail, the normal distribution might not represent the actual data distribution accurately, especially if there’s skewness or outliers present"
cat("\n====== C1 ======\n\n")
## 
## ====== C1 ======
data4 <- wooldridge::wage1
head(data4)
##   wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10   11     2      0        0      1       0      2    1        0     0
## 2 3.24   12    22      2        0      1       1      3    1        0     0
## 3 3.00   11     2      0        0      0       0      2    0        0     0
## 4 6.00    8    44     28        0      0       1      0    1        0     0
## 5 5.30   12     7      2        0      0       1      1    0        0     0
## 6 8.75   16     9      8        0      0       1      0    1        0     0
##   west construc ndurman trcommpu trade services profserv profocc clerocc
## 1    1        0       0        0     0        0        0       0       0
## 2    1        0       0        0     0        1        0       0       0
## 3    1        0       0        0     1        0        0       0       0
## 4    1        0       0        0     0        0        0       0       1
## 5    1        0       0        0     0        0        0       0       0
## 6    1        0       0        0     0        0        1       1       0
##   servocc    lwage expersq tenursq
## 1       0 1.131402       4       0
## 2       1 1.175573     484       4
## 3       0 1.098612       4       0
## 4       0 1.791759    1936     784
## 5       0 1.667707      49       4
## 6       0 2.169054      81      64
data("wage1")
attach(wage1)
model_wage <- lm( wage ~ educ + exper + tenure )
summary(model_wage)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
hist(model_wage$residuals)

print("i. wage = −2.87 + .599 educ + .022 exper + .169 tenure (0.73) (.051) (.012) (.022) n = 526, R^2 = .306, σ = 3.085. The 526 residual histogram, with i = 1, 2,…, 526, is shown above. The formula in the Stata manual suggests using 27 bins for 526 observations, which is what the histogram does. The normal distribution that fits the histogram the best is also presented for comparison.")
## [1] "i. wage = −2.87 + .599 educ + .022 exper + .169 tenure (0.73) (.051) (.012) (.022) n = 526, R^2 = .306, σ = 3.085. The 526 residual histogram, with i = 1, 2,…, 526, is shown above. The formula in the Stata manual suggests using 27 bins for 526 observations, which is what the histogram does. The normal distribution that fits the histogram the best is also presented for comparison."
model_lg_wage <- lm(log(wage)  ~ educ + exper + tenure)
summary(model_lg_wage)
## 
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05802 -0.29645 -0.03265  0.28788  1.42809 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.284360   0.104190   2.729  0.00656 ** 
## educ        0.092029   0.007330  12.555  < 2e-16 ***
## exper       0.004121   0.001723   2.391  0.01714 *  
## tenure      0.022067   0.003094   7.133 3.29e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared:  0.316,  Adjusted R-squared:  0.3121 
## F-statistic: 80.39 on 3 and 522 DF,  p-value: < 2.2e-16
hist(model_lg_wage$residuals)

print("ii. log(wage) = .284 + .092 educ + .0041 exper + .022 tenure (.104) (.007) (.0017) (.003) n = 526, R2 = .316, σ = .441. The histogram for the residuals from this equation, with the best-fitting normal distribution overlaid, is given above.")
## [1] "ii. log(wage) = .284 + .092 educ + .0041 exper + .022 tenure (.104) (.007) (.0017) (.003) n = 526, R2 = .316, σ = .441. The histogram for the residuals from this equation, with the best-fitting normal distribution overlaid, is given above."
print("iii. The residuals derived from the log(wage) regression showcase a closer resemblance to a normal distribution. Specifically, when comparing the histograms between part (i) and part (ii), the histogram in part (ii) aligns more closely with the expected normal density. Besides, the histogram illustrating wage residuals displays a noticeable left skew. In the wage regression analysis, there exist certain exceptionally large residuals, approximately valued at 15, positioned nearly five estimated standard deviations (σ = 3.085) away from the mean of the residuals, which inherently is zero. However, these residuals far from zero seem to pose less of an issue in the log(wage) regression analysis.")
## [1] "iii. The residuals derived from the log(wage) regression showcase a closer resemblance to a normal distribution. Specifically, when comparing the histograms between part (i) and part (ii), the histogram in part (ii) aligns more closely with the expected normal density. Besides, the histogram illustrating wage residuals displays a noticeable left skew. In the wage regression analysis, there exist certain exceptionally large residuals, approximately valued at 15, positioned nearly five estimated standard deviations (σ = 3.085) away from the mean of the residuals, which inherently is zero. However, these residuals far from zero seem to pose less of an issue in the log(wage) regression analysis."
model6 <- lm(wage ~ educ + exper + tenure, data = data4)
summary(model6)
## 
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = data4)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -7.6068 -1.7747 -0.6279  1.1969 14.6536 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -2.87273    0.72896  -3.941 9.22e-05 ***
## educ         0.59897    0.05128  11.679  < 2e-16 ***
## exper        0.02234    0.01206   1.853   0.0645 .  
## tenure       0.16927    0.02164   7.820 2.93e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared:  0.3064, Adjusted R-squared:  0.3024 
## F-statistic: 76.87 on 3 and 522 DF,  p-value: < 2.2e-16
plot(model6)