library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(dslabs)
library(ISLR2)
library(matlib)
## Warning in rgl.init(initValue, onlyNULL): RGL: unable to open X11 display
## Warning: 'rgl.init' failed, running with 'rgl.useNULL = TRUE'.
library(wooldridge)
cat("\n====== Chapter 2 ======\n\n")
##
## ====== Chapter 2 ======
cat("\n====== C9 ======\n\n")
##
## ====== C9 ======
data("countymurders")
data1 <- countymurders
attach(data1)
## The following object is masked from package:dslabs:
##
## murders
data1 <- data1 %>% filter(year=="1996")
head(data1)
## arrests countyid density popul perc1019 perc2029 percblack percmale
## 1 8 1001 67.21535 40061 15.89077 13.17491 20.975510 48.70073
## 2 6 1003 77.05643 123023 13.93886 11.63929 13.496660 48.83233
## 3 1 1005 29.91548 26475 15.06327 13.69972 46.190750 49.15203
## 4 0 1009 67.20457 43392 14.17542 12.99318 1.415007 48.97446
## 5 1 1011 17.89899 11188 14.98927 14.13121 72.756520 49.91956
## 6 2 1013 27.71148 21530 15.68509 11.25871 41.384110 46.81839
## rpcincmaint rpcpersinc rpcunemins year murders murdrate arrestrate statefips
## 1 192.038 11852.760 26.796 1996 7 1.7473350 1.9969550 1
## 2 139.084 13583.020 28.710 1996 6 0.4877137 0.4877137 1
## 3 405.768 10760.510 63.162 1996 1 0.3777148 0.3777148 1
## 4 184.382 11094.820 21.692 1996 2 0.4609145 0.0000000 1
## 5 485.518 8349.506 63.162 1996 0 0.0000000 0.8938148 1
## 6 357.918 9947.058 54.868 1996 2 0.9289364 0.9289364 1
## countyfips execs lpopul execrate
## 1 1 0 10.598160 0
## 2 3 0 11.720130 0
## 3 5 0 10.183960 0
## 4 9 0 10.678030 0
## 5 11 0 9.322598 0
## 6 13 0 9.977202 0
nrow(data1[data1$murders==0,])
## [1] 1051
nrow(data1[data1$execs>=1,])
## [1] 31
max(execs)
## [1] 7
print("there are 1051 counties having no murders, 31 counties have at least one execution, and the maximum number for execution is 7.")
## [1] "there are 1051 counties having no murders, 31 counties have at least one execution, and the maximum number for execution is 7."
model_c9 <- lm(murders ~ execs)
summary(model_c9)
##
## Call:
## lm(formula = murders ~ execs)
##
## Residuals:
## Min 1Q Median 3Q Max
## -202.23 -6.84 -5.84 -3.84 1937.16
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.8382 0.2418 28.28 <2e-16 ***
## execs 65.4650 2.1463 30.50 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 46.64 on 37347 degrees of freedom
## Multiple R-squared: 0.02431, Adjusted R-squared: 0.02428
## F-statistic: 930.4 on 1 and 37347 DF, p-value: < 2.2e-16
plot(model_c9)
## (four lm diagnostic plots: Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage)
print(" murders = 6.8382 + 65.4650*execs N = 2197, R-squared = 0.0243.")
## [1] " murders = 6.8382 + 65.4650*execs N = 2197, R-squared = 0.0243."
print("Coefficient slope B1 = 65.4650, interpret that one unit increasing in execution will lead to 65.465 unit increasing in number of murders.")
## [1] "Coefficient slope B1 = 65.4650, interpret that one unit increasing in execution will lead to 65.465 unit increasing in number of murders."
print("Based on the fomular above, the smallest number of murders is equal to the constrant B0 (6.8382) when the value of execution = 0.")
## [1] "Based on the fomular above, the smallest number of murders is equal to the constrant B0 (6.8382) when the value of execution = 0."
print("A simple regression model is not well-suited for analyzing the deterrent effect of capital punishment based on county murder data in 1996 due to issues such as endogeneity, omitted variable bias, heterogeneity across counties, and the potential for nonlinear relationships. Advanced econometric techniques, accounting for these complexities, are more appropriate for a nuanced understanding of the relationship between capital punishment and murder rates.")
## [1] "A simple regression model is not well-suited for analyzing the deterrent effect of capital punishment based on county murder data in 1996 due to issues such as endogeneity, omitted variable bias, heterogeneity across counties, and the potential for nonlinear relationships. Advanced econometric techniques, accounting for these complexities, are more appropriate for a nuanced understanding of the relationship between capital punishment and murder rates."
cat("\n====== Chapter 3 ======\n\n")
##
## ====== Chapter 3 ======
cat("\n====== C5======\n\n")
##
## ====== C5======
print("i. No, because study = 168 - sleep - work - leisure.")
## [1] "i. No, because study = 168 - sleep - work - leisure."
print("ii.According to assumption MLR 3, in the sample (and therefore in the population), none of the independent variables is constant and there are no exact relationships among the independent variables. Since the sum of all independent variables is 168, there is an exect linear relationship between them. This violates the assumption MLR.3")
## [1] "ii.According to assumption MLR 3, in the sample (and therefore in the population), none of the independent variables is constant and there are no exact relationships among the independent variables. Since the sum of all independent variables is 168, there is an exect linear relationship between them. This violates the assumption MLR.3"
print("iii. Since student age is under 18 year-old, they are almost not working yet, therefore, we can eliminate the working variable out of the formula since its effect is logically insignificant.To reformulate the model so its parameters have a useful interpretation and satisfy MLR.3, we can drop 1 variable. For GPA, study, sleep, leisure can be more important. Hence, we drop work. The formula will be:
GPA= B0+ B1* study + B2* sleep + B3 * leisure+u")
## [1] "iii. Since student age is under 18 year-old, they are almost not working yet, therefore, we can eliminate the working variable out of the formula since its effect is logically insignificant.To reformulate the model so its parameters have a useful interpretation and satisfy MLR.3, we can drop 1 variable. For GPA, study, sleep, leisure can be more important. Hence, we drop work. The formula will be:\n\nGPA= B0+ B1* study + B2* sleep + B3 * leisure+u"
cat("\n====== C10 ======\n\n")
##
## ====== C10 ======
cat("*y = a0 + a1x1 + e1*\n*y = b0 + b1x1 + b2x2 + b3x3 + e2*")
## *y = a0 + a1x1 + e1*
## *y = b0 + b1x1 + b2x2 + b3x3 + e2*
print("i. the a1 = b1 + cov[(x1,x2); (x1; x3)]. There for, if x1 is highly related to x2 and x3, we will have the value of a1 is much greater than b1. If x1 is highly corelated with x2 and x3, and x2 and x3 have large partial effects on y, the regression coefficient of simple regression will be biased and smaller than the coefficient of multiple regression. This is because simple regression did not account for effects of x2 and x3, leading to omitted variable bias. By adding x2 and x3, the model can better isolate the ceteris paribus relationship between y and x1, providing a more accurate estimate of the true effect of x1 on y. Therefore, the coefficient for x1 in the multiple regression is likely to be larger than the coefficient in the simple regression due to the reduction in omitted variable bias.")
## [1] "i. the a1 = b1 + cov[(x1,x2); (x1; x3)]. There for, if x1 is highly related to x2 and x3, we will have the value of a1 is much greater than b1. If x1 is highly corelated with x2 and x3, and x2 and x3 have large partial effects on y, the regression coefficient of simple regression will be biased and smaller than the coefficient of multiple regression. This is because simple regression did not account for effects of x2 and x3, leading to omitted variable bias. By adding x2 and x3, the model can better isolate the ceteris paribus relationship between y and x1, providing a more accurate estimate of the true effect of x1 on y. Therefore, the coefficient for x1 in the multiple regression is likely to be larger than the coefficient in the simple regression due to the reduction in omitted variable bias."
print("ii. If x1 is uncorrelated with x2, x3 => the covariance between (x1,x2) and between (x1,x3) is insignificant, then a1 and b1 are the same.Because the relationship among x1,x2 and x3 cannot be treated separatedly. Since, the highly correlated between x2 and x3 still cause bias. Consquently, the result will be different.")
## [1] "ii. If x1 is uncorrelated with x2, x3 => the covariance between (x1,x2) and between (x1,x3) is insignificant, then a1 and b1 are the same.Because the relationship among x1,x2 and x3 cannot be treated separatedly. Since, the highly correlated between x2 and x3 still cause bias. Consquently, the result will be different."
print("iii. b1 will be smaller. If x1 is highly correlated with x2 and x3, but x2 and x3 have small partial effects on y, I expect se(B1) for simple regression will smaller than se(B1) for multiple regression. It is because the impacts of x2 and x3 are insignificant. Since, we can exclude them from the model with better results.
")
## [1] "iii. b1 will be smaller. If x1 is highly correlated with x2 and x3, but x2 and x3 have small partial effects on y, I expect se(B1) for simple regression will smaller than se(B1) for multiple regression. It is because the impacts of x2 and x3 are insignificant. Since, we can exclude them from the model with better results.\n\n"
print("iv. b1 will be smaller. If x1 is highly correlated with x2 and x3, and x2 and x3 have larg partial effects on y, I expect se(B1) for multiple regression will smaller than se(B1) for simple regression. It is because the impacts of x2 and x3 are significant; thus, excluding them from the model will cause bias.")
## [1] "iv. b1 will be smaller. If x1 is highly correlated with x2 and x3, and x2 and x3 have larg partial effects on y, I expect se(B1) for multiple regression will smaller than se(B1) for simple regression. It is because the impacts of x2 and x3 are significant; thus, excluding them from the model will cause bias."
cat("\n====== C8 ======\n\n")
##
## ====== C8 ======
data2 <- wooldridge::discrim
head(data2)
## psoda pfries pentree wagest nmgrs nregs hrsopen emp psoda2 pfries2 pentree2
## 1 1.12 1.06 1.02 4.25 3 5 16.0 27.5 1.11 1.11 1.05
## 2 1.06 0.91 0.95 4.75 3 3 16.5 21.5 1.05 0.89 0.95
## 3 1.06 0.91 0.98 4.25 3 5 18.0 30.0 1.05 0.94 0.98
## 4 1.12 1.02 1.06 5.00 4 5 16.0 27.5 1.15 1.05 1.05
## 5 1.12 NA 0.49 5.00 3 3 16.0 5.0 1.04 1.01 0.58
## 6 1.06 0.95 1.01 4.25 4 4 15.0 17.5 1.05 0.94 1.00
## wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density crmrte state
## 1 5.05 5 5 15.0 27.0 1 3 4030 0.0528866 1
## 2 5.05 4 3 17.5 24.5 0 1 4030 0.0528866 1
## 3 5.05 4 5 17.5 25.0 0 1 11400 0.0360003 1
## 4 5.05 4 5 16.0 NA 0 3 8345 0.0484232 1
## 5 5.05 3 3 16.0 12.0 0 1 720 0.0615890 1
## 6 5.05 3 4 15.0 28.0 0 1 4424 0.0334823 1
## prpblck prppov prpncar hseval nstores income county lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200 3 41164 12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600 3 50366 10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100 1 72287 10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000 2 44515 18 0.05826885
## lpfries lhseval lincome ldensity NJ BK KFC RR
## 1 0.05826885 11.90699 10.70401 8.301521 1 0 0 1
## 2 -0.09431065 11.90699 10.70401 8.301521 1 1 0 0
## 3 -0.09431065 12.03884 10.62532 9.341369 1 1 0 0
## 4 0.01980261 12.05292 10.82707 9.029418 1 0 0 1
## 5 NA 12.42561 11.18840 6.579251 1 1 0 0
## 6 -0.05129331 11.90497 10.70358 8.394799 1 1 0 0
data("discrim")
data2 <- discrim
attach(discrim)
## The following object is masked from data1:
##
## density
mean(prpblck, na.rm = T)
## [1] 0.1134864
mean(income, na.rm = T)
## [1] 47053.78
sd(prpblck, na.rm = T)
## [1] 0.1824165
sd(income, na.rm = T)
## [1] 13179.29
print("i. The averages for “prpblck” and “income” are 0.113 and 47,053, respectively. The standard deviations are likewise, 0.1824 and 13,179.29, respectively. It is apparent that prpblck represents a proportion of the black population, while income is represented in dollar terms.")
## [1] "i. The averages for “prpblck” and “income” are 0.113 and 47,053, respectively. The standard deviations are likewise, 0.1824 and 13,179.29, respectively. It is apparent that prpblck represents a proportion of the black population, while income is represented in dollar terms."
model_c8 <- lm(psoda~prpblck+income, data = discrim)
summary(model_c8)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
print("ii. The resulting regression is psoda.hat = (0.956) + (0.115)prpblck + (0.0000016). The optimal sample size is 399 observations (indicated by the 398 degrees of freedom and 9 missing observations) and the adjusted R^2 is 0.595. The coefficient on prpblck indicates that, all things being equal, if prpblck increases by 10% the price of soda will increase by approximately 1.2 cents, which is not economically significant.
")
## [1] "ii. The resulting regression is psoda.hat = (0.956) + (0.115)prpblck + (0.0000016). The optimal sample size is 399 observations (indicated by the 398 degrees of freedom and 9 missing observations) and the adjusted R^2 is 0.595. The coefficient on prpblck indicates that, all things being equal, if prpblck increases by 10% the price of soda will increase by approximately 1.2 cents, which is not economically significant.\n"
model_c8_1 <- lm(psoda~prpblck, data = discrim)
summary(model_c8_1)
##
## Call:
## lm(formula = psoda ~ prpblck, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
print("iii. The estimate of the coefficient on prpblack with the simple regression is 0.065. This is lower than the prior estimate, and therefore shows that the discrimination effect decreases when income is excluded.")
## [1] "iii. The estimate of the coefficient on prpblack with the simple regression is 0.065. This is lower than the prior estimate, and therefore shows that the discrimination effect decreases when income is excluded."
model_c8_2 <- lm(log(psoda)~prpblck+log(income), data = discrim)
summary(model_c8_2)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
paste("iv. A 0.20 increase in prpblck is predicted to raise psoda by about", (0.2*100)*0.122, "percent")
## [1] "iv. A 0.20 increase in prpblck is predicted to raise psoda by about 2.44 percent"
model_c8_3 <- lm(log(psoda)~prpblck+log(income)+prppov)
model_c8_3
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov)
##
## Coefficients:
## (Intercept) prpblck log(income) prppov
## -1.46333 0.07281 0.13696 0.38036
cor(log(discrim$income), discrim$prppov, method = "pearson", use = "complete.obs")
## [1] -0.838467
print("v. The correlation is approximately -0.838. This makes sense, because one would expect that declines in income would result in higher poverty rates.")
## [1] "v. The correlation is approximately -0.838. This makes sense, because one would expect that declines in income would result in higher poverty rates."
model <- lm(log(psoda)~prpblck+ log(income)+prppov, data=discrim)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = discrim)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
print("vi. The correlation is not as my expectation. It tells that proportion of poverty and price of soda is only 0.0259, meaning no relation. I expected the poorer the population, the less price we obtain.")
## [1] "vi. The correlation is not as my expectation. It tells that proportion of poverty and price of soda is only 0.0259, meaning no relation. I expected the poorer the population, the less price we obtain."
data8_6 <- discrim %>% select(psoda,prppov) %>% na.omit
cor(data8_6)
## psoda prppov
## psoda 1.00000000 0.02598077
## prppov 0.02598077 1.00000000
data8_7 <- discrim %>% select(income,prppov) %>% na.omit %>% mutate(income= log(income))
cor(data8_7)
## income prppov
## income 1.000000 -0.838467
## prppov -0.838467 1.000000
cat("\n====== Chapter 4 ======\n\n")
##
## ====== Chapter 4 ======
cat("\n====== C3 ======\n\n")
##
## ====== C3 ======
data("rdchem")
attach(rdchem)
paste(log(1.1)*0.321, "percentage point change in rdintens")
## [1] "0.0305945677171883 percentage point change in rdintens"
print("i. If log(sales) increases by one unit, rdintens increases by 0.321 percentage points. More usefully, a 10% increase in sales raises rdintens by 0.321*log(1.1), about 0.03 percentage points, which is a small effect.")
## [1] "i. If log(sales) increases by one unit, rdintens increases by 0.321 percentage points. More usefully, a 10% increase in sales raises rdintens by 0.321*log(1.1), about 0.03 percentage points, which is a small effect."
model_c3 <- lm(rdintens ~ log(sales) + profmarg)
summary(model_c3)
##
## Call:
## lm(formula = rdintens ~ log(sales) + profmarg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.3016 -1.2707 -0.6895 0.8785 6.0369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.47225 1.67606 0.282 0.780
## log(sales) 0.32135 0.21557 1.491 0.147
## profmarg 0.05004 0.04578 1.093 0.283
##
## Residual standard error: 1.839 on 29 degrees of freedom
## Multiple R-squared: 0.09847, Adjusted R-squared: 0.0363
## F-statistic: 1.584 on 2 and 29 DF, p-value: 0.2224
print("ii. The 5% critical value for a one-tailedtest, with df = 32 − 3 = 29, is obtained from Table G.2 as 1.699; so we cannot reject H0 at the 5% level. But the 10% critical value is 1.311; since the t statistic is abovethis value, we reject H0 in favor of H1 at the 10% level.Hypothesis testing
rdintens= B0+ B1* log(sales)+ B2 *profmarg
We have H0: B1= 0 and H1: B1 # 0
t-value = 0.321/0.216=1.486111
With n=32, df= 31:
Critical value for 5% level of significance is c= 2.042
Critical value for 10% level of significance is c= 1.69
Because 1.486 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in sales can increase RD intensity.")
## [1] "ii. The 5% critical value for a one-tailedtest, with df = 32 − 3 = 29, is obtained from Table G.2 as 1.699; so we cannot reject H0 at the 5% level. But the 10% critical value is 1.311; since the t statistic is abovethis value, we reject H0 in favor of H1 at the 10% level.Hypothesis testing\nrdintens= B0+ B1* log(sales)+ B2 *profmarg\n\nWe have H0: B1= 0 and H1: B1 # 0\n\nt-value = 0.321/0.216=1.486111\n\nWith n=32, df= 31:\n\nCritical value for 5% level of significance is c= 2.042\n\nCritical value for 10% level of significance is c= 1.69\n\nBecause 1.486 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in sales can increase RD intensity."
print("iii. One percentage unit change in profmarg interprets 0.05 percentage unit change in rdintens. Economically that is quite significant, as given a10 % increase in profit margin then they will increase expenditures on R& D by 0.5 percentage point. Coef
For profmarg, 1% increase in profit margin will lead to 0.05% increase in RD intensity. It cannot be considered as economically large because the average is 2.5%. However, with a big firm, when they already have large scale, the increase in RD intensity at 0.05% is acceptable.
Hypothesis Testing
rdintens= B0+ B1* log(sales)+ B2 *profmarg
We have H0: B2= 0 and H1: B2 # 0
t-value = 0.050/0.046=1.086857
With n=32, df= 31:
Critical value for 5% level of significance is c= 2.042
Critical value for 10% level of significance is c= 1.69
Because 1.086957 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in profit margin can increase RD intensity.")
## [1] "iii. One percentage unit change in profmarg interprets 0.05 percentage unit change in rdintens. Economically that is quite significant, as given a10 % increase in profit margin then they will increase expenditures on R& D by 0.5 percentage point. Coef\nFor profmarg, 1% increase in profit margin will lead to 0.05% increase in RD intensity. It cannot be considered as economically large because the average is 2.5%. However, with a big firm, when they already have large scale, the increase in RD intensity at 0.05% is acceptable.\n\nHypothesis Testing\nrdintens= B0+ B1* log(sales)+ B2 *profmarg\n\nWe have H0: B2= 0 and H1: B2 # 0\n\nt-value = 0.050/0.046=1.086857\n\nWith n=32, df= 31:\n\nCritical value for 5% level of significance is c= 2.042\n\nCritical value for 10% level of significance is c= 1.69\n\nBecause 1.086957 < 1.69 < 2.042, we cannot reject H0. Consequently, we cannot conclude that increase in profit margin can increase RD intensity."
cat("\n====== C8 ======\n\n")
##
## ====== C8 ======
data_c4 <- wooldridge::k401ksubs
head(data_c4,10)
## e401k inc marr male age fsize nettfa p401k pira incsq agesq
## 1 0 13.170 0 0 40 1 4.575 0 1 173.4489 1600
## 2 1 61.230 0 1 35 1 154.000 1 0 3749.1128 1225
## 3 0 12.858 1 0 44 2 0.000 0 0 165.3282 1936
## 4 0 98.880 1 1 44 2 21.800 0 0 9777.2539 1936
## 5 0 22.614 0 0 53 1 18.450 0 0 511.3930 2809
## 6 0 15.000 1 0 60 3 0.000 0 0 225.0000 3600
## 7 0 37.155 1 0 49 5 3.483 0 1 1380.4939 2401
## 8 0 31.896 1 0 38 5 -2.100 0 0 1017.3548 1444
## 9 0 47.295 1 0 52 2 5.290 0 1 2236.8169 2704
## 10 1 29.100 0 1 45 1 29.600 0 1 846.8100 2025
data("k401ksubs")
data4 <- k401ksubs %>% filter(fsize==1)
dim(data4)
## [1] 2017 11
attach(data4)
print("i. There are 2017 single-person households in the dataframe")
## [1] "i. There are 2017 single-person households in the dataframe"
model_c4 <- lm(nettfa ~ inc+ age)
summary(model_c4)
##
## Call:
## lm(formula = nettfa ~ inc + age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -179.95 -14.16 -3.42 6.03 1113.94
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -43.03981 4.08039 -10.548 <2e-16 ***
## inc 0.79932 0.05973 13.382 <2e-16 ***
## age 0.84266 0.09202 9.158 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 44.68 on 2014 degrees of freedom
## Multiple R-squared: 0.1193, Adjusted R-squared: 0.1185
## F-statistic: 136.5 on 2 and 2014 DF, p-value: < 2.2e-16
min(age)
## [1] 25
min(inc)
## [1] 10.008
print("ii. nettfa = -43.04 + 0.799inc + 0.842age (4.08) (.0597) (.0920) n = 2017, R^2 = .1185 One unit increasing in inc interprets .07993 unit increasing in nettfa One unit increasing in age interprets .84266 unit increasing in nettfa The model is significant, based on when age goes up, the income goes up and lead to the net financial wealth also go up.")
## [1] "ii. nettfa = -43.04 + 0.799inc + 0.842age (4.08) (.0597) (.0920) n = 2017, R^2 = .1185 One unit increasing in inc interprets .07993 unit increasing in nettfa One unit increasing in age interprets .84266 unit increasing in nettfa The model is significant, based on when age goes up, the income goes up and lead to the net financial wealth also go up."
income <- lm(inc ~ age)
income
##
## Call:
## lm(formula = inc ~ age)
##
## Coefficients:
## (Intercept) age
## 27.08278 0.06017
cov(log(inc), age)
## [1] 0.1907963
model <- lm(nettfa~inc, data= data_c4)
summary(model)
##
## Call:
## lm(formula = nettfa ~ inc, data = data_c4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -504.39 -18.10 -4.29 6.73 1475.04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -20.17948 1.17643 -17.15 <2e-16 ***
## inc 0.99991 0.02554 39.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 59.26 on 9273 degrees of freedom
## Multiple R-squared: 0.1418, Adjusted R-squared: 0.1417
## F-statistic: 1532 on 1 and 9273 DF, p-value: < 2.2e-16
cor(data_c4$age, data_c4$inc)
## [1] 0.1056378
cat("\n====== Chapter 5 ======\n\n")
##
## ====== Chapter 5 ======
cat("\n====== C5 ======\n\n")
##
## ====== C5 ======
data4 <- wooldridge::econmath
head(data4)
## age work study econhs colgpa hsgpa acteng actmth act mathscr male calculus
## 1 23 15 10.0 0 3.4909 3.355 24 26 27 10 1 1
## 2 23 0 22.5 1 2.1000 3.219 23 20 24 9 1 0
## 3 21 25 12.0 0 3.0851 3.306 21 24 21 8 1 1
## 4 22 30 40.0 0 2.6805 3.977 31 28 31 10 0 1
## 5 22 25 15.0 1 3.7454 3.890 28 31 32 8 1 1
## 6 22 0 30.0 0 3.0555 3.500 25 30 28 10 1 1
## attexc attgood fathcoll mothcoll score
## 1 0 0 1 1 84.43
## 2 0 0 0 1 57.38
## 3 1 0 0 1 66.39
## 4 0 1 1 1 81.15
## 5 0 1 0 1 95.90
## 6 1 0 0 1 83.61
print("i. If I use the normal distribution, probability that score exceeds 100 will not be zero because it should be symmetric. However, in real data, 100 is the highest score, hence my answer is contradict the assumption of normal distribution for score.")
## [1] "i. If I use the normal distribution, probability that score exceeds 100 will not be zero because it should be symmetric. However, in real data, 100 is the highest score, hence my answer is contradict the assumption of normal distribution for score."
print("ii.In the left tail of a histogram, the normal distribution might not fit well. The normal distribution assumes perfect symmetry and tails that extend infinitely, while real data might have asymmetry or limited tail extensions. Therefore, in the left tail, the normal distribution might not represent the actual data distribution accurately, especially if there’s skewness or outliers present")
## [1] "ii.In the left tail of a histogram, the normal distribution might not fit well. The normal distribution assumes perfect symmetry and tails that extend infinitely, while real data might have asymmetry or limited tail extensions. Therefore, in the left tail, the normal distribution might not represent the actual data distribution accurately, especially if there’s skewness or outliers present"
cat("\n====== C1 ======\n\n")
##
## ====== C1 ======
data4 <- wooldridge::wage1
head(data4)
## wage educ exper tenure nonwhite female married numdep smsa northcen south
## 1 3.10 11 2 0 0 1 0 2 1 0 0
## 2 3.24 12 22 2 0 1 1 3 1 0 0
## 3 3.00 11 2 0 0 0 0 2 0 0 0
## 4 6.00 8 44 28 0 0 1 0 1 0 0
## 5 5.30 12 7 2 0 0 1 1 0 0 0
## 6 8.75 16 9 8 0 0 1 0 1 0 0
## west construc ndurman trcommpu trade services profserv profocc clerocc
## 1 1 0 0 0 0 0 0 0 0
## 2 1 0 0 0 0 1 0 0 0
## 3 1 0 0 0 1 0 0 0 0
## 4 1 0 0 0 0 0 0 0 1
## 5 1 0 0 0 0 0 0 0 0
## 6 1 0 0 0 0 0 1 1 0
## servocc lwage expersq tenursq
## 1 0 1.131402 4 0
## 2 1 1.175573 484 4
## 3 0 1.098612 4 0
## 4 0 1.791759 1936 784
## 5 0 1.667707 49 4
## 6 0 2.169054 81 64
data("wage1")
attach(wage1)
model_wage <- lm( wage ~ educ + exper + tenure )
summary(model_wage)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
hist(model_wage$residuals, breaks = 27, freq = FALSE)
curve(dnorm(x, mean(model_wage$residuals), sd(model_wage$residuals)), add = TRUE)
## (histogram of the wage-equation residuals with the best-fitting normal density overlaid)
print("i. wage = −2.87 + .599 educ + .022 exper + .169 tenure (0.73) (.051) (.012) (.022) n = 526, R^2 = .306, σ = 3.085. The 526 residual histogram, with i = 1, 2,…, 526, is shown above. The formula in the Stata manual suggests using 27 bins for 526 observations, which is what the histogram does. The normal distribution that fits the histogram the best is also presented for comparison.")
## [1] "i. wage = −2.87 + .599 educ + .022 exper + .169 tenure (0.73) (.051) (.012) (.022) n = 526, R^2 = .306, σ = 3.085. The 526 residual histogram, with i = 1, 2,…, 526, is shown above. The formula in the Stata manual suggests using 27 bins for 526 observations, which is what the histogram does. The normal distribution that fits the histogram the best is also presented for comparison."
model_lg_wage <- lm(log(wage) ~ educ + exper + tenure)
summary(model_lg_wage)
##
## Call:
## lm(formula = log(wage) ~ educ + exper + tenure)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05802 -0.29645 -0.03265 0.28788 1.42809
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.284360 0.104190 2.729 0.00656 **
## educ 0.092029 0.007330 12.555 < 2e-16 ***
## exper 0.004121 0.001723 2.391 0.01714 *
## tenure 0.022067 0.003094 7.133 3.29e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4409 on 522 degrees of freedom
## Multiple R-squared: 0.316, Adjusted R-squared: 0.3121
## F-statistic: 80.39 on 3 and 522 DF, p-value: < 2.2e-16
hist(model_lg_wage$residuals, breaks = 27, freq = FALSE)
curve(dnorm(x, mean(model_lg_wage$residuals), sd(model_lg_wage$residuals)), add = TRUE)
## (histogram of the log(wage)-equation residuals with the best-fitting normal density overlaid)
print("ii. log(wage) = .284 + .092 educ + .0041 exper + .022 tenure (.104) (.007) (.0017) (.003) n = 526, R2 = .316, σ = .441. The histogram for the residuals from this equation, with the best-fitting normal distribution overlaid, is given above.")
## [1] "ii. log(wage) = .284 + .092 educ + .0041 exper + .022 tenure (.104) (.007) (.0017) (.003) n = 526, R2 = .316, σ = .441. The histogram for the residuals from this equation, with the best-fitting normal distribution overlaid, is given above."
print("iii. The residuals derived from the log(wage) regression showcase a closer resemblance to a normal distribution. Specifically, when comparing the histograms between part (i) and part (ii), the histogram in part (ii) aligns more closely with the expected normal density. Besides, the histogram illustrating wage residuals displays a noticeable left skew. In the wage regression analysis, there exist certain exceptionally large residuals, approximately valued at 15, positioned nearly five estimated standard deviations (σ = 3.085) away from the mean of the residuals, which inherently is zero. However, these residuals far from zero seem to pose less of an issue in the log(wage) regression analysis.")
## [1] "iii. The residuals derived from the log(wage) regression showcase a closer resemblance to a normal distribution. Specifically, when comparing the histograms between part (i) and part (ii), the histogram in part (ii) aligns more closely with the expected normal density. Besides, the histogram illustrating wage residuals displays a noticeable left skew. In the wage regression analysis, there exist certain exceptionally large residuals, approximately valued at 15, positioned nearly five estimated standard deviations (σ = 3.085) away from the mean of the residuals, which inherently is zero. However, these residuals far from zero seem to pose less of an issue in the log(wage) regression analysis."
model6 <- lm(wage ~ educ + exper + tenure, data = data4)
summary(model6)
##
## Call:
## lm(formula = wage ~ educ + exper + tenure, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.6068 -1.7747 -0.6279 1.1969 14.6536
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.87273 0.72896 -3.941 9.22e-05 ***
## educ 0.59897 0.05128 11.679 < 2e-16 ***
## exper 0.02234 0.01206 1.853 0.0645 .
## tenure 0.16927 0.02164 7.820 2.93e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.084 on 522 degrees of freedom
## Multiple R-squared: 0.3064, Adjusted R-squared: 0.3024
## F-statistic: 76.87 on 3 and 522 DF, p-value: < 2.2e-16
plot(model6)
## (four lm diagnostic plots: Residuals vs Fitted, Normal Q-Q, Scale-Location, Residuals vs Leverage)