library(wooldridge)
## Warning: package 'wooldridge' was built under R version 4.2.3
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.2.3
# Load the hprice2 dataset shipped with the wooldridge package and preview
# its first rows.
data <- wooldridge::hprice2
head(data)
## price crime nox rooms dist radial proptax stratio lowstat lprice lnox
## 1 24000 0.006 5.38 6.57 4.09 1 29.6 15.3 4.98 10.085809 1.682688
## 2 21599 0.027 4.69 6.42 4.97 2 24.2 17.8 9.14 9.980402 1.545433
## 3 34700 0.027 4.69 7.18 4.97 2 24.2 17.8 4.03 10.454495 1.545433
## 4 33400 0.032 4.58 7.00 6.06 3 22.2 18.7 2.94 10.416311 1.521699
## 5 36199 0.069 4.58 7.15 6.06 3 22.2 18.7 5.33 10.496787 1.521699
## 6 28701 0.030 4.58 6.43 6.06 3 22.2 18.7 5.21 10.264688 1.521699
## lproptax
## 1 5.690360
## 2 5.488938
## 3 5.488938
## 4 5.402678
## 5 5.402678
## 6 5.402678
# Two-column copy holding only dist and price. Because the columns are passed
# as `data$...` expressions, data.frame() names them `data.dist` and
# `data.price`.
data1 <- data.frame(data$dist, data$price)
head(data1)
## data.dist data.price
## 1 4.09 24000
## 2 4.97 21599
## 3 4.97 34700
## 4 6.06 33400
## 5 6.06 36199
## 6 6.06 28701
# Open the data viewer only in interactive sessions; View() needs a GUI data
# viewer and is not appropriate when the script runs unattended.
if (interactive()) {
  View(data1)
}
# Simple OLS regression of price on dist. The formula uses bare column names
# that are resolved through `data =`. The previous `data$price ~ data$dist`
# form bypassed the `data` argument entirely, so `data = data1` had no effect
# and predict() with new data would not work.
model1 <- lm(price ~ dist, data = data)
summary(model1)
##
## Call:
## lm(formula = price ~ dist, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -14998 -5595 -1879 2304 30396
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18373.3 818.6 22.45 < 2e-16 ***
## dist 1090.2 188.6 5.78 1.31e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8927 on 504 degrees of freedom
## Multiple R-squared: 0.06217, Adjusted R-squared: 0.06031
## F-statistic: 33.41 on 1 and 504 DF, p-value: 1.309e-08
# Scatter plot with the regressor (dist) on the x-axis and the response
# (price) on the y-axis, so the picture matches the price-on-dist regression.
# plot(x, y) takes the horizontal variable first; the arguments were
# previously the other way round.
plot(data$dist, data$price, main = "Scatter plot")
# Pearson correlation between dist and price (symmetric in its arguments).
correlation <- cor(data$dist, data$price)
cat("Correlation coefficient:", correlation, "\n")
## Correlation coefficient: 0.2493394
# Log-log specification: log(price) on log(dist). glm() is called with its
# default gaussian family, so the coefficient estimates are the OLS ones.
# The formula uses bare column names resolved through `data =`; the previous
# `log(data$price) ~ log(data$dist)` form ignored the `data` argument.
model2 <- glm(log(price) ~ log(dist), data = data)
summary(model2)
##
## Call:
## glm(formula = log(price) ~ log(dist), data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.18184 -0.21302 -0.02242 0.16747 1.20554
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.57679 0.04033 237.479 <2e-16 ***
## log(dist) 0.30657 0.03091 9.919 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1404129)
##
## Null deviance: 84.582 on 505 degrees of freedom
## Residual deviance: 70.768 on 504 degrees of freedom
## AIC: 446.6
##
## Number of Fisher Scoring iterations: 2
# Scatter plot of the logged variables with the regressor log(dist) on the
# x-axis and the response log(price) on the y-axis, matching the log-log
# regression. plot(x, y) takes the horizontal variable first; the arguments
# were previously the other way round.
plot(log(data$dist), log(data$price), main = "Scatter plot")
# Pearson correlation between the logged variables.
correlation <- cor(log(data$dist), log(data$price))
cat("Correlation coefficient:", correlation, "\n")
## Correlation coefficient: 0.4041306
# Load the wage2 dataset from the wooldridge package and preview its first
# rows.
data2 <- wooldridge::wage2
head(data2)
## wage hours IQ KWW educ exper tenure age married black south urban sibs
## 1 769 40 93 35 12 11 2 31 1 0 0 1 1
## 2 808 50 119 41 18 11 16 37 1 0 0 1 1
## 3 825 40 108 46 14 11 9 33 1 0 0 1 1
## 4 650 40 96 32 12 13 7 32 1 0 0 1 4
## 5 562 40 74 27 11 14 5 34 1 0 0 1 10
## 6 1400 40 116 43 16 14 2 35 1 1 0 1 1
## brthord meduc feduc lwage
## 1 2 8 8 6.645091
## 2 NA 14 14 6.694562
## 3 2 14 14 6.715384
## 4 3 12 12 6.476973
## 5 6 6 11 6.331502
## 6 2 8 NA 7.244227
# Sample mean and standard deviation of wage and IQ.
# The columns are read from `data2` rather than from the global name `wage2`:
# further down this script `wage2` is reassigned to a plain number, after
# which `wage2$wage` fails if the script is run again in the same session.
a <- mean(data2$wage)
a
## [1] 957.9455
b <- mean(data2$IQ)
b
## [1] 101.2824
a1 <- sd(data2$wage)
a1
## [1] 404.3608
b1 <- sd(data2$IQ)
b1
## [1] 15.05264
# Level-level OLS regression of wage on IQ.
model3 <- lm(wage ~ IQ, data = data2)
summary(model3)
##
## Call:
## lm(formula = wage ~ IQ, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -898.7 -256.5 -47.3 201.1 2072.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 116.9916 85.6415 1.366 0.172
## IQ 8.3031 0.8364 9.927 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 384.8 on 933 degrees of freedom
## Multiple R-squared: 0.09554, Adjusted R-squared: 0.09457
## F-statistic: 98.55 on 1 and 933 DF, p-value: < 2.2e-16
### wage = 116.9916 + 8.3031 * IQ ### The coefficient on IQ is 8.3031: each one-point increase in IQ is associated with a predicted wage increase of 8.3031 units, regardless of the starting IQ level.
# Predicted wage change for a 15-point increase in IQ, using the rounded
# level-level estimates (intercept 116.9916, slope 8.3031).
# Both IQ values used to be 0, which made the difference trivially 0. The
# comparison point is now 15, about one standard deviation of IQ and the same
# step used for the log-wage prediction later in this script.
# The results are stored under names that do not overwrite the `wage2`
# dataset.
IQ1 <- 0
wage_at_iq1 <- 116.9916 + 8.3031 * IQ1
wage_at_iq1
## [1] 116.9916
IQ2 <- 15
wage_at_iq2 <- 116.9916 + 8.3031 * IQ2
wage_at_iq2
## [1] 241.5381
# In a level-level model the change equals slope * (IQ2 - IQ1).
wage_change <- wage_at_iq2 - wage_at_iq1
wage_change
## [1] 124.5465
# Log-level specification: log(wage) on IQ, fitted with glm() and its default
# gaussian family.
model4 <- glm(log(wage) ~ IQ, data = data2)
summary(model4)
##
## Call:
## glm(formula = log(wage) ~ IQ, data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.09324 -0.25547 0.02261 0.27544 1.21486
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.8869943 0.0890206 66.13 <2e-16 ***
## IQ 0.0088072 0.0008694 10.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1599584)
##
## Null deviance: 165.66 on 934 degrees of freedom
## Residual deviance: 149.24 on 933 degrees of freedom
## AIC: 943.71
##
## Number of Fisher Scoring iterations: 2
# Predicted percentage change in wage for a 15-point increase in IQ, using
# the rounded log-level estimates (intercept 5.887, slope 0.0088).
# wage3 and wage4 are predicted *log* wages, so the percentage change comes
# from their difference, not from their ratio as was computed before.
IQ3 <- 0
wage3 <- 5.887 + 0.0088 * IQ3
wage3
## [1] 5.887
IQ4 <- 15
wage4 <- 5.887 + 0.0088 * IQ4
wage4
## [1] 6.019
# Usual approximation: %change in wage ~= 100 * change in log(wage).
pct_change_approx <- 100 * (wage4 - wage3)
pct_change_approx
## [1] 13.2
# Exact percentage change implied by the log difference.
pct_change_exact <- 100 * (exp(wage4 - wage3) - 1)
pct_change_exact
## [1] 14.11083
# Auxiliary OLS regression of IQ on educ.
model5 <- lm(IQ ~ educ, data = data2)
summary(model5)
##
## Call:
## lm(formula = IQ ~ educ, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.228 -7.262 0.907 8.772 37.373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 53.6872 2.6229 20.47 <2e-16 ***
## educ 3.5338 0.1922 18.39 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12.9 on 933 degrees of freedom
## Multiple R-squared: 0.2659, Adjusted R-squared: 0.2652
## F-statistic: 338 on 1 and 933 DF, p-value: < 2.2e-16
# Log-level regression of log(wage) on educ alone (gaussian glm).
model6 <- glm(log(wage) ~ educ, data = data2)
summary(model6)
##
## Call:
## glm(formula = log(wage) ~ educ, data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.94620 -0.24832 0.03507 0.27440 1.28106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.973062 0.081374 73.40 <2e-16 ***
## educ 0.059839 0.005963 10.04 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1602557)
##
## Null deviance: 165.66 on 934 degrees of freedom
## Residual deviance: 149.52 on 933 degrees of freedom
## AIC: 945.44
##
## Number of Fisher Scoring iterations: 2
# Log-level regression of log(wage) on educ and IQ together (gaussian glm).
model7 <- glm(log(wage) ~ educ + IQ, data = data2)
summary(model7)
##
## Call:
## glm(formula = log(wage) ~ educ + IQ, data = data2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01601 -0.24367 0.03359 0.27960 1.23783
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.6582876 0.0962408 58.793 < 2e-16 ***
## educ 0.0391199 0.0068382 5.721 1.43e-08 ***
## IQ 0.0058631 0.0009979 5.875 5.87e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.1546978)
##
## Null deviance: 165.66 on 934 degrees of freedom
## Residual deviance: 144.18 on 932 degrees of freedom
## AIC: 913.44
##
## Number of Fisher Scoring iterations: 2
# Load the discrim dataset, namespace-qualified like the other dataset loads
# in this script, and preview its first rows.
data3 <- wooldridge::discrim
head(data3)
## psoda pfries pentree wagest nmgrs nregs hrsopen emp psoda2 pfries2 pentree2
## 1 1.12 1.06 1.02 4.25 3 5 16.0 27.5 1.11 1.11 1.05
## 2 1.06 0.91 0.95 4.75 3 3 16.5 21.5 1.05 0.89 0.95
## 3 1.06 0.91 0.98 4.25 3 5 18.0 30.0 1.05 0.94 0.98
## 4 1.12 1.02 1.06 5.00 4 5 16.0 27.5 1.15 1.05 1.05
## 5 1.12 NA 0.49 5.00 3 3 16.0 5.0 1.04 1.01 0.58
## 6 1.06 0.95 1.01 4.25 4 4 15.0 17.5 1.05 0.94 1.00
## wagest2 nmgrs2 nregs2 hrsopen2 emp2 compown chain density crmrte state
## 1 5.05 5 5 15.0 27.0 1 3 4030 0.0528866 1
## 2 5.05 4 3 17.5 24.5 0 1 4030 0.0528866 1
## 3 5.05 4 5 17.5 25.0 0 1 11400 0.0360003 1
## 4 5.05 4 5 16.0 NA 0 3 8345 0.0484232 1
## 5 5.05 3 3 16.0 12.0 0 1 720 0.0615890 1
## 6 5.05 3 4 15.0 28.0 0 1 4424 0.0334823 1
## prpblck prppov prpncar hseval nstores income county lpsoda
## 1 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.11332869
## 2 0.1711542 0.0365789 0.0788428 148300 3 44534 18 0.05826885
## 3 0.0473602 0.0879072 0.2694298 169200 3 41164 12 0.05826885
## 4 0.0528394 0.0591227 0.1366903 171600 3 50366 10 0.11332869
## 5 0.0344800 0.0254145 0.0738020 249100 1 72287 10 0.11332869
## 6 0.0591327 0.0835001 0.1151341 148000 2 44515 18 0.05826885
## lpfries lhseval lincome ldensity NJ BK KFC RR
## 1 0.05826885 11.90699 10.70401 8.301521 1 0 0 1
## 2 -0.09431065 11.90699 10.70401 8.301521 1 1 0 0
## 3 -0.09431065 12.03884 10.62532 9.341369 1 1 0 0
## 4 0.01980261 12.05292 10.82707 9.029418 1 0 0 1
## 5 NA 12.42561 11.18840 6.579251 1 1 0 0
## 6 -0.05129331 11.90497 10.70358 8.394799 1 1 0 0
# Sample mean and standard deviation of prpblck and income, skipping missing
# values. `data3` is the copy of discrim loaded earlier in this script.
a <- mean(data3$prpblck, na.rm = TRUE)
a
## [1] 0.1134864
a1 <- sd(data3$prpblck, na.rm = TRUE)
a1
## [1] 0.1824165
b <- mean(data3$income, na.rm = TRUE)
b
## [1] 47053.78
b1 <- sd(data3$income, na.rm = TRUE)
b1
## [1] 13179.29
# Level-level OLS regression of psoda on prpblck and income.
model8 <- lm(psoda ~ prpblck + income, data = data3)
summary(model8)
##
## Call:
## lm(formula = psoda ~ prpblck + income, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.29401 -0.05242 0.00333 0.04231 0.44322
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.563e-01 1.899e-02 50.354 < 2e-16 ***
## prpblck 1.150e-01 2.600e-02 4.423 1.26e-05 ***
## income 1.603e-06 3.618e-07 4.430 1.22e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08611 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.06422, Adjusted R-squared: 0.05952
## F-statistic: 13.66 on 2 and 398 DF, p-value: 1.835e-06
# Same regression without income, for comparison of the prpblck coefficient.
model81 <- lm(psoda ~ prpblck, data = data3)
summary(model81)
##
## Call:
## lm(formula = psoda ~ prpblck, data = data3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.30884 -0.05963 0.01135 0.03206 0.44840
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.03740 0.00519 199.87 < 2e-16 ***
## prpblck 0.06493 0.02396 2.71 0.00702 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0881 on 399 degrees of freedom
## (9 observations deleted due to missingness)
## Multiple R-squared: 0.01808, Adjusted R-squared: 0.01561
## F-statistic: 7.345 on 1 and 399 DF, p-value: 0.007015
# Regression of log(psoda) on prpblck and log(income) (gaussian glm).
model9 <- glm(log(psoda) ~ prpblck + log(income), data = data3)
summary(model9)
##
## Call:
## glm(formula = log(psoda) ~ prpblck + log(income), data = data3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.33563 -0.04695 0.00658 0.04334 0.35413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.79377 0.17943 -4.424 1.25e-05 ***
## prpblck 0.12158 0.02575 4.722 3.24e-06 ***
## log(income) 0.07651 0.01660 4.610 5.43e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.006740526)
##
## Null deviance: 2.8788 on 400 degrees of freedom
## Residual deviance: 2.6827 on 398 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: -861.87
##
## Number of Fisher Scoring iterations: 2
# Adds prppov to the log(psoda) regression (gaussian glm).
model10 <- glm(log(psoda) ~ prpblck + log(income) + prppov, data = data3)
summary(model10)
##
## Call:
## glm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.006620679)
##
## Null deviance: 2.8788 on 400 degrees of freedom
## Residual deviance: 2.6284 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: -868.07
##
## Number of Fisher Scoring iterations: 2
# Correlation between log(income) and prppov over rows where both are
# observed. `use = "complete.obs"` removes incomplete *pairs*; calling
# na.omit() on each vector separately can shift the remaining values out of
# alignment (or leave vectors of different lengths) whenever the missing
# values are not in exactly the same rows.
log_income <- log(data3$income)
cor(log_income, data3$prppov, use = "complete.obs")
## [1] -0.838467
# Load the dataset named `return` from the wooldridge package and preview its
# first rows.
data4 <- wooldridge::return
head(data4)
## roe rok dkr eps netinc sp90 sp94 salary return lsalary
## 1 18.7 17.4 4.0 48.1 1144 59.375 47.000 1090 -20.842106 6.993933
## 2 1.6 2.4 27.3 -85.3 35 47.875 43.500 1923 -9.138381 7.561642
## 3 4.9 4.6 36.8 -44.1 127 39.000 72.625 1012 86.217949 6.919684
## 4 11.1 8.6 46.4 192.4 367 61.250 142.000 579 131.836731 6.361302
## 5 5.6 4.5 36.2 -60.4 214 58.000 53.250 600 -8.189655 6.396930
## 6 3.5 2.9 18.7 -79.8 118 68.250 50.500 735 -26.007326 6.599871
## lsp90 lnetinc
## 1 4.083873 7.042286
## 2 3.868593 3.555348
## 3 3.663562 4.844187
## 4 4.114964 5.905362
## 5 4.060443 5.365976
## 6 4.223177 4.770685
# OLS regression of return on dkr, eps, netinc and salary, all in levels.
model11 <- lm(return ~ dkr + eps + netinc + salary, data = data4)
summary(model11)
##
## Call:
## lm(formula = return ~ dkr + eps + netinc + salary, data = data4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -88.629 -25.421 -4.215 18.326 124.627
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -14.370216 6.893616 -2.085 0.039 *
## dkr 0.320544 0.200911 1.595 0.113
## eps 0.042699 0.078138 0.546 0.586
## netinc -0.005109 0.004675 -1.093 0.276
## salary 0.003499 0.002194 1.595 0.113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 39.19 on 137 degrees of freedom
## Multiple R-squared: 0.03948, Adjusted R-squared: 0.01143
## F-statistic: 1.408 on 4 and 137 DF, p-value: 0.2347
# Pull the p-value column out of model11's coefficient table.
coef(summary(model11))[, "Pr(>|t|)"]
## (Intercept) dkr eps netinc salary
## 0.03896503 0.11291423 0.58564681 0.27639712 0.11295187
# Same regression with netinc and salary in logs (gaussian glm).
model12 <- glm(return ~ dkr + eps + log(netinc) + log(salary), data = data4)
summary(model12)
##
## Call:
## glm(formula = return ~ dkr + eps + log(netinc) + log(salary),
## data = data4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -80.402 -26.729 -4.223 19.475 126.948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.29934 39.37380 -0.922 0.358
## dkr 0.32658 0.20265 1.612 0.109
## eps 0.06854 0.08035 0.853 0.395
## log(netinc) -4.74530 3.38566 -1.402 0.163
## log(salary) 7.24181 6.31251 1.147 0.253
##
## (Dispersion parameter for gaussian family taken to be 1546.397)
##
## Null deviance: 219096 on 141 degrees of freedom
## Residual deviance: 211856 on 137 degrees of freedom
## AIC: 1452.7
##
## Number of Fisher Scoring iterations: 2
# Pull the p-value column out of model12's coefficient table.
coef(summary(model12))[, "Pr(>|t|)"]
## (Intercept) dkr eps log(netinc) log(salary)
## 0.3581930 0.1093567 0.3951521 0.1633003 0.2532917
# Variant with eps in logs as well (gaussian glm).
# log(eps) is NaN for rows with negative eps (hence the warning below), and
# those rows are dropped from the fit as missing values.
# The unused `c <- na.omit(data4$dkr)` assignment was removed: it was never
# read and it masked the name of base::c in the workspace.
model13 <- glm(
  return ~ dkr + log(eps) + log(netinc) + log(salary),
  data = data4
)
## Warning in log(eps): NaNs produced
summary(model13)
##
## Call:
## glm(formula = return ~ dkr + log(eps) + log(netinc) + log(salary),
## data = data4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -74.544 -23.939 -2.665 20.063 130.803
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -78.1128 46.3123 -1.687 0.0954 .
## dkr 0.3437 0.2489 1.381 0.1710
## log(eps) 4.3074 4.3168 0.998 0.3213
## log(netinc) -5.2817 4.7835 -1.104 0.2727
## log(salary) 11.2767 7.5725 1.489 0.1402
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1497.742)
##
## Null deviance: 132845 on 87 degrees of freedom
## Residual deviance: 124313 on 83 degrees of freedom
## (54 observations deleted due to missingness)
## AIC: 900.02
##
## Number of Fisher Scoring iterations: 2
# Pull the p-value column out of model13's coefficient table.
coef(summary(model13))[, "Pr(>|t|)"]
## (Intercept) dkr log(eps) log(netinc) log(salary)
## 0.09542518 0.17099850 0.32126108 0.27271466 0.14023180
# Regression of log(psoda) on prpblck, log(income) and prppov (gaussian glm).
model14 <- glm(log(psoda) ~ prpblck + log(income) + prppov, data = data3)
summary(model14)
##
## Call:
## glm(formula = log(psoda) ~ prpblck + log(income) + prppov, data = data3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.32218 -0.04648 0.00651 0.04272 0.35622
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.46333 0.29371 -4.982 9.4e-07 ***
## prpblck 0.07281 0.03068 2.373 0.0181 *
## log(income) 0.13696 0.02676 5.119 4.8e-07 ***
## prppov 0.38036 0.13279 2.864 0.0044 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.006620679)
##
## Null deviance: 2.8788 on 400 degrees of freedom
## Residual deviance: 2.6284 on 397 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: -868.07
##
## Number of Fisher Scoring iterations: 2
# Pull the p-value column out of model14's coefficient table.
coef(summary(model14))[, "Pr(>|t|)"]
## (Intercept) prpblck log(income) prppov
## 9.400188e-07 1.809760e-02 4.802041e-07 4.400360e-03
# Correlation between log(income) and prppov over rows where both are
# observed. `use = "complete.obs"` removes incomplete *pairs*; calling
# na.omit() on each vector separately can shift the remaining values out of
# alignment (or leave vectors of different lengths) whenever the missing
# values are not in exactly the same rows.
log_income <- log(data3$income)
cor(log_income, data3$prppov, use = "complete.obs")
## [1] -0.838467
# Adds log(hseval) to the log(psoda) regression (gaussian glm).
model15 <- glm(
  log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
  data = data3
)
summary(model15)
##
## Call:
## glm(formula = log(psoda) ~ prpblck + log(income) + prppov + log(hseval),
## data = data3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.30652 -0.04380 0.00701 0.04332 0.35272
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.84151 0.29243 -2.878 0.004224 **
## prpblck 0.09755 0.02926 3.334 0.000937 ***
## log(income) -0.05299 0.03753 -1.412 0.158706
## prppov 0.05212 0.13450 0.388 0.698571
## log(hseval) 0.12131 0.01768 6.860 2.67e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.005932479)
##
## Null deviance: 2.8788 on 400 degrees of freedom
## Residual deviance: 2.3493 on 396 degrees of freedom
## (9 observations deleted due to missingness)
## AIC: -911.1
##
## Number of Fisher Scoring iterations: 2
##iv).
# Pull the p-value column out of model15's coefficient table.
coef(summary(model15))[, "Pr(>|t|)"]
## (Intercept) prpblck log(income) prppov log(hseval)
## 4.223540e-03 9.373070e-04 1.587065e-01 6.985706e-01 2.668125e-11