library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(wooldridge)
# `data` is what the later IQ / educ / log(wage) regressions receive through
# `data = data`, so it has to hold wage2. bwght has none of those columns, and
# the regressions only ran because attach() had left wage2 on the search path.
data <- wooldridge::wage2
Datanew <- wooldridge::wage2

# Part i: sample mean of wage, and sample mean / standard deviation of IQ.
# Columns are read from the data frame directly instead of via attach().
mean(Datanew$wage)
## [1] 957.9455
mean(Datanew$IQ)
## [1] 101.2824
sd(Datanew$IQ)
## [1] 15.05264
# Level-level regression of wage on IQ, with the data source stated explicitly.
modelnew <- lm(wage ~ IQ, data = Datanew)
summary(Datanew)
## wage hours IQ KWW
## Min. : 115.0 Min. :20.00 Min. : 50.0 Min. :12.00
## 1st Qu.: 669.0 1st Qu.:40.00 1st Qu.: 92.0 1st Qu.:31.00
## Median : 905.0 Median :40.00 Median :102.0 Median :37.00
## Mean : 957.9 Mean :43.93 Mean :101.3 Mean :35.74
## 3rd Qu.:1160.0 3rd Qu.:48.00 3rd Qu.:112.0 3rd Qu.:41.00
## Max. :3078.0 Max. :80.00 Max. :145.0 Max. :56.00
##
## educ exper tenure age
## Min. : 9.00 Min. : 1.00 Min. : 0.000 Min. :28.00
## 1st Qu.:12.00 1st Qu.: 8.00 1st Qu.: 3.000 1st Qu.:30.00
## Median :12.00 Median :11.00 Median : 7.000 Median :33.00
## Mean :13.47 Mean :11.56 Mean : 7.234 Mean :33.08
## 3rd Qu.:16.00 3rd Qu.:15.00 3rd Qu.:11.000 3rd Qu.:36.00
## Max. :18.00 Max. :23.00 Max. :22.000 Max. :38.00
##
## married black south urban
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :1.000 Median :0.0000 Median :0.0000 Median :1.0000
## Mean :0.893 Mean :0.1283 Mean :0.3412 Mean :0.7176
## 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
##
## sibs brthord meduc feduc
## Min. : 0.000 Min. : 1.000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 1.000 1st Qu.: 8.00 1st Qu.: 8.00
## Median : 2.000 Median : 2.000 Median :12.00 Median :10.00
## Mean : 2.941 Mean : 2.277 Mean :10.68 Mean :10.22
## 3rd Qu.: 4.000 3rd Qu.: 3.000 3rd Qu.:12.00 3rd Qu.:12.00
## Max. :14.000 Max. :10.000 Max. :18.00 Max. :18.00
## NA's :83 NA's :78 NA's :194
## lwage
## Min. :4.745
## 1st Qu.:6.506
## Median :6.808
## Mean :6.779
## 3rd Qu.:7.056
## Max. :8.032
##
# Drop the part-i objects. Only names that were actually created are listed:
# `model1` was never defined, so including it made rm() warn "object not found".
rm(Datanew, modelnew)
ii. Estimate a simple regression model where a one-point increase in
IQ changes wage by a constant dollar amount. Use this model to find the
predicted increase in wage for an increase in IQ of 15 points. Does IQ
explain most of the variation in wage?
library(wooldridge)
# Part ii: level-level regression of wage on IQ using the wage2 data.
Data4 <- wooldridge::wage2
# Summary statistics are computed on the data frame columns. Passing the quoted
# names ("wage", "IQ") gave mean()/sd() a character string, which returns NA.
mean(Data4$wage)
## [1] 957.9455
mean(Data4$IQ)
## [1] 101.2824
sd(Data4$IQ)
## [1] 15.05264
# `data = Data4` replaces attach(Data4): the formula variables are resolved
# from the data frame rather than from the search path.
model <- lm(wage ~ IQ, data = Data4)
summary(model)
##
## Call:
## lm(formula = wage ~ IQ, data = Data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -898.7 -256.5 -47.3 201.1 2072.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 116.9916 85.6415 1.366 0.172
## IQ 8.3031 0.8364 9.927 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 384.8 on 933 degrees of freedom
## Multiple R-squared: 0.09554, Adjusted R-squared: 0.09457
## F-statistic: 98.55 on 1 and 933 DF, p-value: < 2.2e-16
Now, estimate a model where each one-point increase in IQ has the
same percentage effect on wage. If IQ increases by 15 points, what is
the approximate percentage increase in predicted wage?
Answer (to part ii): The IQ coefficient is 8.3031, so a 15-point rise in IQ
increases predicted wage by 15 × 8.3031 ≈ 124.5 dollars. Also, the linear
regression output gives R² = 0.0955 (adjusted R² = 0.0946), so variation in
IQ explains only about 9.6% of the variation in wage. As a result, IQ does
not explain the majority of wage variation.
# Auxiliary regression of IQ on educ; the educ slope is saved as theta1 below.
model <- lm(formula = IQ ~ educ, data = data)
summary(model)
##
## Call:
## lm(formula = IQ ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.228 -7.262 0.907 8.772 37.373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.6872 2.6229 20.47 <2e-16 ***
## educ 3.5338 0.1922 18.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.9 on 933 degrees of freedom
## Multiple R-squared: 0.2659, Adjusted R-squared: 0.2652
## F-statistic: 338 on 1 and 933 DF, p-value: < 2.2e-16
# Slope on educ from the most recently fitted `model`, kept as a named value.
theta1 <- coef(model)["educ"]
ii. log(wage) ~ educ: The coefficient is 0.059839, meaning that one more year
of education is associated with a 0.06 increase in log(wage), i.e. roughly a 6% higher wage.
# Simple regression of log(wage) on educ; the educ slope is saved as beta1 below.
model <- lm(formula = log(wage) ~ educ, data = data)
summary(model)
##
## Call:
## lm(formula = log(wage) ~ educ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94620 -0.24832 0.03507 0.27440 1.28106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.973062 0.081374 73.40 <2e-16 ***
## educ 0.059839 0.005963 10.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4003 on 933 degrees of freedom
## Multiple R-squared: 0.09742, Adjusted R-squared: 0.09645
## F-statistic: 100.7 on 1 and 933 DF, p-value: < 2.2e-16
# Slope on educ from the most recently fitted `model`, kept as a named value.
beta1 <- coef(model)["educ"]
iii. log(wage) ~ educ + IQ: The coefficients are 0.0391199 and 0.0058631 for
educ and IQ, respectively. Holding years of education fixed, a one-point
increase in IQ is associated with a 0.0059 increase in log(wage), i.e. about
a 0.6% higher wage. Besides, holding IQ fixed, one more year of education is
associated with a 0.039 increase in log(wage), i.e. about a 3.9% higher wage.
# Multiple regression of log(wage) on educ and IQ; both slopes are saved below.
model <- lm(formula = log(wage) ~ educ + IQ, data = data)
summary(model)
##
## Call:
## lm(formula = log(wage) ~ educ + IQ, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.01601 -0.24367 0.03359 0.27960 1.23783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6582876 0.0962408 58.793 < 2e-16 ***
## educ 0.0391199 0.0068382 5.721 1.43e-08 ***
## IQ 0.0058631 0.0009979 5.875 5.87e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3933 on 932 degrees of freedom
## Multiple R-squared: 0.1297, Adjusted R-squared: 0.1278
## F-statistic: 69.42 on 2 and 932 DF, p-value: < 2.2e-16
# Slopes on educ and IQ from the most recently fitted `model`, looked up by name.
beta1_2 <- coef(model)["educ"]
beta2 <- coef(model)["IQ"]
iv. Verification: The relation beta1 = beta1_2 + beta2 * theta1 holds exactly
in the sample, so the difference below is 0 (up to floating-point rounding).
# Difference between (educ slope with IQ controlled + IQ slope * slope of IQ on
# educ) and the educ slope from the simple regression; prints the result.
beta1_2+beta2*theta1 - beta1
## educ
## 0
C8 i. Mean and standard deviation: Income is measured in USD, while prpblck
is a proportion (a fraction between 0 and 1), not a percentage.
# C8: work on the discrim data set under the name data8.
data8 <- wooldridge::discrim
# Mean and standard deviation of income and of prpblck, ignoring missing values.
print(paste("The mean of income will be:", mean(data8[["income"]], na.rm = TRUE)))
## [1] "The mean of income will be: 47053.7848410758"
print(paste("The standard deviation of income will be:", sd(data8[["income"]], na.rm = TRUE)))
## [1] "The standard deviation of income will be: 13179.2860689389"
print(paste("The mean of people black will be:", mean(data8[["prpblck"]], na.rm = TRUE)))
## [1] "The mean of people black will be: 0.113486396497833"
print(paste("The standard deviation of people black will be:", sd(data8[["prpblck"]], na.rm = TRUE)))
## [1] "The standard deviation of people black will be: 0.182416467486231"
ii. psoda ~ prpblck + income: The coefficient on prpblck is 0.115, meaning
that if the proportion of Black residents in the zip code rises by 0.10 (ten
percentage points), the price of a medium soda is estimated to rise by about
0.0115 USD, holding income fixed. This is not economically large.
# Regress soda price on prpblck and income.
model <- lm(psoda ~ prpblck + income, data = data8)
# Kept under its own name so that base::summary() is not shadowed.
model_summary <- summary(model)
# Number of observations actually used in the fit. lm() drops every row with a
# missing psoda, prpblck or income, so counting non-missing psoda alone (402)
# overstates the estimation sample (398 residual df + 3 coefficients = 401).
sample_size <- nobs(model)
model_summary
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
print(paste("The sample size is:", sample_size))
## [1] "The sample size is: 401"
print(paste("The Rsquared is:", model_summary$r.squared))
## [1] "The Rsquared is: 0.0642203910903628"
iii. psoda ~ prpblck: The coefficient is 0.0649, compared with 0.115 when
income is included. Hence, the estimated effect is larger when we control for income.
# Simple regression of soda price on prpblck alone (income left out).
model <- lm(formula = psoda ~ prpblck, data = data8)
summary(model)
##
## Call:
## lm(formula = psoda ~ prpblck, data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
iv. log(psoda) ~ prpblck + log(income): The coefficient on prpblck is 0.1216.
In this log-level form, a 0.10 increase in prpblck is associated with roughly
a 1.2% higher soda price, holding log(income) fixed.
# Log price of soda regressed on prpblck and log income.
model <- lm(formula = log(psoda) ~ prpblck + log(income), data = data8)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income), data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0821 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06809, Adjusted R-squared: 0.06341
## F-statistic: 14.54 on 2 and 398 DF, p-value: 8.039e-07
v. log(psoda) ~ prpblck + log(income) + prppov: The coefficient on prpblck
decreases to 0.07281, and its significance also weakens (p-value 0.0181, versus 3.24e-06 without prppov).
# Same log-price regression with prppov added as a further regressor.
model <- lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data8)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
vi. Correlation: The correlation is not what I expected. The correlation
between the proportion in poverty and the price of soda is only 0.0260,
meaning essentially no linear relationship. I expected that the poorer the
population, the lower the price.
# Remove missing values from the vectors
# Keep psoda and prppov, drop rows with a missing value in either column,
# then print their correlation matrix.
data8_6 <- na.omit(select(data8, psoda, prppov))
cor(data8_6)
## psoda prppov
## psoda 1.00000000 0.02598077
## prppov 0.02598077 1.00000000
C9 i. model report
# C9 part i: log price of soda on prpblck, log income and prppov.
model <- lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data8)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08137 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.08696, Adjusted R-squared: 0.08006
## F-statistic: 12.6 on 3 and 397 DF, p-value: 6.917e-08
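ii. Correlation: The correlation between log(income) and prppov is -0.838.
Both variables are nevertheless statistically significant in the regression
above: the p-value for log(income) is approximately 0 (4.8e-07) and the p-value for prppov is 0.0044.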
# Keep income and prppov, drop incomplete rows, replace income by its log,
# then print the correlation matrix (the column is still named `income`).
data9_2 <- mutate(na.omit(select(data8, income, prppov)), income = log(income))
cor(data9_2)
## income prppov
## income 1.000000 -0.838467
## prppov -0.838467 1.000000
iii. Model modification: The coefficient on log(hseval) is 0.121, meaning
that a 1% increase in housing value is associated with about a 0.121% increase
in the price of soda, other factors fixed. The null hypothesis that this
coefficient is zero is strongly rejected (p-value ≈ 0).
# Previous specification extended with log(hseval).
model <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
  data = data8
)
summary(model)
##
## Call:
## lm(formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
## data = data8)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30652 -0.04380 0.00701 0.04332 0.35272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.84151 0.29243 -2.878 0.004224 **
## prpblck 0.09755 0.02926 3.334 0.000937 ***
## log(income) -0.05299 0.03753 -1.412 0.158706
## prppov 0.05212 0.13450 0.388 0.698571
## log(hseval) 0.12131 0.01768 6.860 2.67e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.07702 on 396 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.1839, Adjusted R-squared: 0.1757
## F-statistic: 22.31 on 4 and 396 DF, p-value: < 2.2e-16
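iv. Insignificant variables: log(income) and prppov are individually
insignificant in this model, but the F test below shows that they are jointly
significant at the 5% level (p-value = 0.030).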
# Unrestricted model: all four regressors.
modelur <- lm(
  formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
  data = data8
)
# Restricted model: log(income) and prppov left out.
modelr <- lm(formula = log(psoda) ~ prpblck + log(hseval), data = data8)
# F test comparing the two fits; models are passed unrestricted first.
ftest <- anova(modelur, modelr)
print(ftest)
## Analysis of Variance Table
##
## Model 1: log(psoda) ~ prpblck + log(income) + prppov + log(hseval)
## Model 2: log(psoda) ~ prpblck + log(hseval)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 396 2.3493
## 2 398 2.3911 -2 -0.041797 3.5227 0.03045 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
v. Model interpretation: Based on the model in part iii of C9, we conclude
that prpblck and log(hseval) are the two most reliable variables, because
their coefficients are statistically significant with small p-values.