Load libraries
library(arm)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.0.5
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.0.5
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 4.0.5
##
## arm (Version 1.12-2, built: 2021-10-15)
## Working directory is /Users/gennacampain/Desktop/Spring 2022/Knowledge Mining/Homework
library(MASS)
library(ISLR)
Load data
attach(Boston)
names(Boston)
## [1] "crim" "zn" "indus" "chas" "nox" "rm" "age"
## [8] "dis" "rad" "tax" "ptratio" "black" "lstat" "medv"
plot(medv~lstat,Boston)
fit1 <- lm(medv~lstat,data=Boston)
fit1
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Coefficients:
## (Intercept) lstat
## 34.55 -0.95
summary(fit1)
##
## Call:
## lm(formula = medv ~ lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.168 -3.990 -1.318 2.034 24.500
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34.55384 0.56263 61.41 <2e-16 ***
## lstat -0.95005 0.03873 -24.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared: 0.5441, Adjusted R-squared: 0.5432
## F-statistic: 601.6 on 1 and 504 DF, p-value: < 2.2e-16
abline(fit1,col="red")
names(fit1)
## [1] "coefficients" "residuals" "effects" "rank"
## [5] "fitted.values" "assign" "qr" "df.residual"
## [9] "xlevels" "call" "terms" "model"
confint(fit1)
## 2.5 % 97.5 %
## (Intercept) 33.448457 35.6592247
## lstat -1.026148 -0.8739505
Prediction using values in lstat Prediction interval uses sample mean and takes into account the variability of the estimators for μ and σ. Therefore, the interval will be wider.
predict(fit1,data.frame(lstat=c(0,5,10,15)),interval="confidence") # confidence intervals
## fit lwr upr
## 1 34.55384 33.44846 35.65922
## 2 29.80359 29.00741 30.59978
## 3 25.05335 24.47413 25.63256
## 4 20.30310 19.73159 20.87461
predict(fit1,data.frame(lstat=c(0,5,10,15)),interval="prediction") # prediction intervals
## fit lwr upr
## 1 34.55384 22.291923 46.81576
## 2 29.80359 17.565675 42.04151
## 3 25.05335 12.827626 37.27907
## 4 20.30310 8.077742 32.52846
fit2 <- lm(medv~lstat+age,data=Boston)
summary(fit2)
##
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.981 -3.978 -1.283 1.968 23.158
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.22276 0.73085 45.458 < 2e-16 ***
## lstat -1.03207 0.04819 -21.416 < 2e-16 ***
## age 0.03454 0.01223 2.826 0.00491 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared: 0.5513, Adjusted R-squared: 0.5495
## F-statistic: 309 on 2 and 503 DF, p-value: < 2.2e-16
fit3 <- lm(medv~.,Boston)
summary(fit3)
##
## Call:
## lm(formula = medv ~ ., data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.595 -2.730 -0.518 1.777 26.199
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.646e+01 5.103e+00 7.144 3.28e-12 ***
## crim -1.080e-01 3.286e-02 -3.287 0.001087 **
## zn 4.642e-02 1.373e-02 3.382 0.000778 ***
## indus 2.056e-02 6.150e-02 0.334 0.738288
## chas 2.687e+00 8.616e-01 3.118 0.001925 **
## nox -1.777e+01 3.820e+00 -4.651 4.25e-06 ***
## rm 3.810e+00 4.179e-01 9.116 < 2e-16 ***
## age 6.922e-04 1.321e-02 0.052 0.958229
## dis -1.476e+00 1.995e-01 -7.398 6.01e-13 ***
## rad 3.060e-01 6.635e-02 4.613 5.07e-06 ***
## tax -1.233e-02 3.760e-03 -3.280 0.001112 **
## ptratio -9.527e-01 1.308e-01 -7.283 1.31e-12 ***
## black 9.312e-03 2.686e-03 3.467 0.000573 ***
## lstat -5.248e-01 5.072e-02 -10.347 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7338
## F-statistic: 108.1 on 13 and 492 DF, p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit3)
Respecify model remove age and indus variables
fit4 <- update(fit3,~.-age-indus)
summary(fit4)
##
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad +
## tax + ptratio + black + lstat, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.5984 -2.7386 -0.5046 1.7273 26.2373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.341145 5.067492 7.171 2.73e-12 ***
## crim -0.108413 0.032779 -3.307 0.001010 **
## zn 0.045845 0.013523 3.390 0.000754 ***
## chas 2.718716 0.854240 3.183 0.001551 **
## nox -17.376023 3.535243 -4.915 1.21e-06 ***
## rm 3.801579 0.406316 9.356 < 2e-16 ***
## dis -1.492711 0.185731 -8.037 6.84e-15 ***
## rad 0.299608 0.063402 4.726 3.00e-06 ***
## tax -0.011778 0.003372 -3.493 0.000521 ***
## ptratio -0.946525 0.129066 -7.334 9.24e-13 ***
## black 0.009291 0.002674 3.475 0.000557 ***
## lstat -0.522553 0.047424 -11.019 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared: 0.7406, Adjusted R-squared: 0.7348
## F-statistic: 128.2 on 11 and 494 DF, p-value: < 2.2e-16
Set plot configuration
par(mfrow=c(2,2))
plot(fit4)
Uses coefplot to plot coefficients. Note the line at 0.
par(mfrow=c(1,1))
arm::coefplot(fit4)
### Nonlinear Terms and Interactions
fit5 <- lm(medv~lstat*age,Boston)
summary(fit5)
##
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.806 -4.045 -1.333 2.085 27.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.0885359 1.4698355 24.553 < 2e-16 ***
## lstat -1.3921168 0.1674555 -8.313 8.78e-16 ***
## age -0.0007209 0.0198792 -0.036 0.9711
## lstat:age 0.0041560 0.0018518 2.244 0.0252 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared: 0.5557, Adjusted R-squared: 0.5531
## F-statistic: 209.3 on 3 and 502 DF, p-value: < 2.2e-16
I() identity function for squared term to interpret as-is Combine two command lines with semicolon
fit6 <- lm(medv~lstat +I(lstat^2),Boston); summary(fit6)
##
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.2834 -3.8313 -0.5295 2.3095 25.4148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 42.862007 0.872084 49.15 <2e-16 ***
## lstat -2.332821 0.123803 -18.84 <2e-16 ***
## I(lstat^2) 0.043547 0.003745 11.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared: 0.6407, Adjusted R-squared: 0.6393
## F-statistic: 448.5 on 2 and 503 DF, p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(medv~lstat)
points(lstat,fitted(fit6),col="red",pch=20)
fit7 <- lm(medv~poly(lstat,4))
points(lstat,fitted(fit7),col="blue",pch=20)
### Qualitative Predictors
names(Carseats)
## [1] "Sales" "CompPrice" "Income" "Advertising" "Population"
## [6] "Price" "ShelveLoc" "Age" "Education" "Urban"
## [11] "US"
summary(Carseats)
## Sales CompPrice Income Advertising
## Min. : 0.000 Min. : 77 Min. : 21.00 Min. : 0.000
## 1st Qu.: 5.390 1st Qu.:115 1st Qu.: 42.75 1st Qu.: 0.000
## Median : 7.490 Median :125 Median : 69.00 Median : 5.000
## Mean : 7.496 Mean :125 Mean : 68.66 Mean : 6.635
## 3rd Qu.: 9.320 3rd Qu.:135 3rd Qu.: 91.00 3rd Qu.:12.000
## Max. :16.270 Max. :175 Max. :120.00 Max. :29.000
## Population Price ShelveLoc Age Education
## Min. : 10.0 Min. : 24.0 Bad : 96 Min. :25.00 Min. :10.0
## 1st Qu.:139.0 1st Qu.:100.0 Good : 85 1st Qu.:39.75 1st Qu.:12.0
## Median :272.0 Median :117.0 Medium:219 Median :54.50 Median :14.0
## Mean :264.8 Mean :115.8 Mean :53.32 Mean :13.9
## 3rd Qu.:398.5 3rd Qu.:131.0 3rd Qu.:66.00 3rd Qu.:16.0
## Max. :509.0 Max. :191.0 Max. :80.00 Max. :18.0
## Urban US
## No :118 No :142
## Yes:282 Yes:258
##
##
##
##
fit1 <- lm(Sales~.+Income:Advertising+Age:Price,Carseats) # add two interaction terms
summary(fit1)
##
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.9208 -0.7503 0.0177 0.6754 3.3413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.5755654 1.0087470 6.519 2.22e-10 ***
## CompPrice 0.0929371 0.0041183 22.567 < 2e-16 ***
## Income 0.0108940 0.0026044 4.183 3.57e-05 ***
## Advertising 0.0702462 0.0226091 3.107 0.002030 **
## Population 0.0001592 0.0003679 0.433 0.665330
## Price -0.1008064 0.0074399 -13.549 < 2e-16 ***
## ShelveLocGood 4.8486762 0.1528378 31.724 < 2e-16 ***
## ShelveLocMedium 1.9532620 0.1257682 15.531 < 2e-16 ***
## Age -0.0579466 0.0159506 -3.633 0.000318 ***
## Education -0.0208525 0.0196131 -1.063 0.288361
## UrbanYes 0.1401597 0.1124019 1.247 0.213171
## USYes -0.1575571 0.1489234 -1.058 0.290729
## Income:Advertising 0.0007510 0.0002784 2.698 0.007290 **
## Price:Age 0.0001068 0.0001333 0.801 0.423812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared: 0.8761, Adjusted R-squared: 0.8719
## F-statistic: 210 on 13 and 386 DF, p-value: < 2.2e-16
attach(Carseats)
contrasts(Carseats$ShelveLoc)
## Good Medium
## Bad 0 0
## Good 1 0
## Medium 0 1
Combine the lm, plot and abline functions to create a one step regression fit plot function
regplot=function(x,y){
fit=lm(y~x)
plot(x,y)
abline(fit,col="red")
}
attach(Carseats)
## The following objects are masked from Carseats (pos = 3):
##
## Advertising, Age, CompPrice, Education, Income, Population, Price,
## Sales, ShelveLoc, Urban, US
regplot(Price,Sales)
# allow extra room for additional arguments
regplot=function(x,y,...){
fit=lm(y~x)
plot(x,y,...)
abline(fit,col="red")
}
regplot(Price,Sales,xlab="Price",ylab="Sales",col="blue",pch=20)
## Additional note: try out the coefplot2 package to finetune the coefplots
library(coefplot2)
## Loading required package: coda
##
## Attaching package: 'coda'
## The following object is masked from 'package:arm':
##
## traceplot
coefplot2(fit3)
# Assignment 6 Use the TEDS2016 dataset to run a logit (logistic regression) model using female as sole predictor. The dependent variable is the vote (1-0) for Tsai Ing-wen, the female candidate for the then opposition party Democratic Progressive Party (DPP). Access the data set using the following codes:
setwd("~/Desktop/Spring 2022/Knowledge Mining")
library(haven)
TEDS_2016 <- read_dta("TEDS_2016.dta")
Are female voters more likely to vote for President Tsai? Why or why not? The coefficient for female is -0.06517 but it is not significant at any level. This suggests that there is no significant difference in the voting patterns of males and females in regard to votes for President Tsai. If anything, the coefficient suggests that females are less likely to vote for President Tsai.
glm.vt <- glm(votetsai~female, data=TEDS_2016,family=binomial)
summary(glm.vt)
##
## Call:
## glm(formula = votetsai ~ female, family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4180 -1.3889 0.9546 0.9797 0.9797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.54971 0.08245 6.667 2.61e-11 ***
## female -0.06517 0.11644 -0.560 0.576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1666.5 on 1260 degrees of freedom
## Residual deviance: 1666.2 on 1259 degrees of freedom
## (429 observations deleted due to missingness)
## AIC: 1670.2
##
## Number of Fisher Scoring iterations: 4
Add party ID variables (KMT, DPP) and other demographic variables (age, edu, income) to improve the model. What do you find? Which group of variables work better in explaining/predicting votetsai These variables appear to predict votetsai better based on the AIC which is 850 in this model compared with 1670 in the other model.
glm.vt2 <- glm(votetsai~female + KMT + DPP + age + edu + income, data=TEDS_2016,family=binomial)
summary(glm.vt2)
##
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + age + edu + income,
## family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7360 -0.3673 0.2408 0.2946 2.5408
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.618640 0.592084 2.734 0.00626 **
## female 0.047406 0.177403 0.267 0.78930
## KMT -3.156273 0.250360 -12.607 < 2e-16 ***
## DPP 2.888943 0.267968 10.781 < 2e-16 ***
## age -0.011808 0.007164 -1.648 0.09931 .
## edu -0.184604 0.083102 -2.221 0.02632 *
## income 0.013727 0.034382 0.399 0.68971
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1661.76 on 1256 degrees of freedom
## Residual deviance: 836.15 on 1250 degrees of freedom
## (433 observations deleted due to missingness)
## AIC: 850.15
##
## Number of Fisher Scoring iterations: 6
names(glm.vt2)
## [1] "coefficients" "residuals" "fitted.values"
## [4] "effects" "R" "rank"
## [7] "qr" "family" "linear.predictors"
## [10] "deviance" "aic" "null.deviance"
## [13] "iter" "weights" "prior.weights"
## [16] "df.residual" "df.null" "y"
## [19] "converged" "boundary" "model"
## [22] "na.action" "call" "formula"
## [25] "terms" "data" "offset"
## [28] "control" "method" "contrasts"
## [31] "xlevels"
Try adding the following variables: Independence – Supporting Taiwan’s Independence (vs. Unification with China) Econ_worse – Evaluations of economy (Negative) Govt_dont_care – Political Efficacy (Government does not care about people) Minnan_father – Descendent of local Taiwanese Mainland_father – Descendent of mainland China (migrated from mainland circa or after 1949) Taiwanese – Self-identified Taiwanese The model with all variables has an AIC of 793 which is the smallest AIC of all three, suggesting that this is the best model for predicting votetsai.
glm.vt3 <- glm(votetsai~female + KMT + DPP + age + edu + income + Independence + Econ_worse + Govt_dont_care + Minnan_father + Mainland_father + Taiwanese, data=TEDS_2016,family=binomial)
summary(glm.vt3)
##
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + age + edu + income +
## Independence + Econ_worse + Govt_dont_care + Minnan_father +
## Mainland_father + Taiwanese, family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1060 -0.3143 0.1744 0.3975 2.7917
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.015976 0.679780 -0.024 0.98125
## female -0.097996 0.189840 -0.516 0.60571
## KMT -2.922246 0.259333 -11.268 < 2e-16 ***
## DPP 2.468855 0.275350 8.966 < 2e-16 ***
## age 0.003287 0.007884 0.417 0.67672
## edu -0.092110 0.090119 -1.022 0.30674
## income 0.021771 0.036406 0.598 0.54984
## Independence 1.020953 0.251776 4.055 5.01e-05 ***
## Econ_worse 0.310462 0.189100 1.642 0.10063
## Govt_dont_care -0.014295 0.188765 -0.076 0.93964
## Minnan_father -0.247650 0.253921 -0.975 0.32941
## Mainland_father -1.089332 0.396822 -2.745 0.00605 **
## Taiwanese 0.909019 0.198930 4.570 4.89e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1661.76 on 1256 degrees of freedom
## Residual deviance: 767.13 on 1244 degrees of freedom
## (433 observations deleted due to missingness)
## AIC: 793.13
##
## Number of Fisher Scoring iterations: 6