Lab_LinearRegression01

Load libraries

library(arm)
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 4.0.5
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.0.5
## Loading required package: lme4
## Warning: package 'lme4' was built under R version 4.0.5
## 
## arm (Version 1.12-2, built: 2021-10-15)
## Working directory is /Users/gennacampain/Desktop/Spring 2022/Knowledge Mining/Homework
library(MASS)
library(ISLR)

Load data

attach(Boston)
names(Boston)
##  [1] "crim"    "zn"      "indus"   "chas"    "nox"     "rm"      "age"    
##  [8] "dis"     "rad"     "tax"     "ptratio" "black"   "lstat"   "medv"

Simple linear regression

plot(medv~lstat,Boston)
fit1 <- lm(medv~lstat,data=Boston)
fit1
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Coefficients:
## (Intercept)        lstat  
##       34.55        -0.95
summary(fit1)
## 
## Call:
## lm(formula = medv ~ lstat, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.168  -3.990  -1.318   2.034  24.500 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 34.55384    0.56263   61.41   <2e-16 ***
## lstat       -0.95005    0.03873  -24.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.216 on 504 degrees of freedom
## Multiple R-squared:  0.5441, Adjusted R-squared:  0.5432 
## F-statistic: 601.6 on 1 and 504 DF,  p-value: < 2.2e-16
abline(fit1,col="red")

names(fit1)
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "xlevels"       "call"          "terms"         "model"
confint(fit1)
##                 2.5 %     97.5 %
## (Intercept) 33.448457 35.6592247
## lstat       -1.026148 -0.8739505

Prediction using values in lstat Prediction interval uses sample mean and takes into account the variability of the estimators for μ and σ. Therefore, the interval will be wider.

predict(fit1,data.frame(lstat=c(0,5,10,15)),interval="confidence") # confidence intervals
##        fit      lwr      upr
## 1 34.55384 33.44846 35.65922
## 2 29.80359 29.00741 30.59978
## 3 25.05335 24.47413 25.63256
## 4 20.30310 19.73159 20.87461
predict(fit1,data.frame(lstat=c(0,5,10,15)),interval="prediction") # prediction intervals
##        fit       lwr      upr
## 1 34.55384 22.291923 46.81576
## 2 29.80359 17.565675 42.04151
## 3 25.05335 12.827626 37.27907
## 4 20.30310  8.077742 32.52846

Multiple linear regression

fit2 <- lm(medv~lstat+age,data=Boston)
summary(fit2)
## 
## Call:
## lm(formula = medv ~ lstat + age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.981  -3.978  -1.283   1.968  23.158 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.22276    0.73085  45.458  < 2e-16 ***
## lstat       -1.03207    0.04819 -21.416  < 2e-16 ***
## age          0.03454    0.01223   2.826  0.00491 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.173 on 503 degrees of freedom
## Multiple R-squared:  0.5513, Adjusted R-squared:  0.5495 
## F-statistic:   309 on 2 and 503 DF,  p-value: < 2.2e-16
fit3 <- lm(medv~.,Boston)
summary(fit3)
## 
## Call:
## lm(formula = medv ~ ., data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.595  -2.730  -0.518   1.777  26.199 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.646e+01  5.103e+00   7.144 3.28e-12 ***
## crim        -1.080e-01  3.286e-02  -3.287 0.001087 ** 
## zn           4.642e-02  1.373e-02   3.382 0.000778 ***
## indus        2.056e-02  6.150e-02   0.334 0.738288    
## chas         2.687e+00  8.616e-01   3.118 0.001925 ** 
## nox         -1.777e+01  3.820e+00  -4.651 4.25e-06 ***
## rm           3.810e+00  4.179e-01   9.116  < 2e-16 ***
## age          6.922e-04  1.321e-02   0.052 0.958229    
## dis         -1.476e+00  1.995e-01  -7.398 6.01e-13 ***
## rad          3.060e-01  6.635e-02   4.613 5.07e-06 ***
## tax         -1.233e-02  3.760e-03  -3.280 0.001112 ** 
## ptratio     -9.527e-01  1.308e-01  -7.283 1.31e-12 ***
## black        9.312e-03  2.686e-03   3.467 0.000573 ***
## lstat       -5.248e-01  5.072e-02 -10.347  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.745 on 492 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7338 
## F-statistic: 108.1 on 13 and 492 DF,  p-value: < 2.2e-16
par(mfrow=c(2,2))
plot(fit3)

Respecify model remove age and indus variables

fit4 <- update(fit3,~.-age-indus)
summary(fit4)
## 
## Call:
## lm(formula = medv ~ crim + zn + chas + nox + rm + dis + rad + 
##     tax + ptratio + black + lstat, data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.5984  -2.7386  -0.5046   1.7273  26.2373 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  36.341145   5.067492   7.171 2.73e-12 ***
## crim         -0.108413   0.032779  -3.307 0.001010 ** 
## zn            0.045845   0.013523   3.390 0.000754 ***
## chas          2.718716   0.854240   3.183 0.001551 ** 
## nox         -17.376023   3.535243  -4.915 1.21e-06 ***
## rm            3.801579   0.406316   9.356  < 2e-16 ***
## dis          -1.492711   0.185731  -8.037 6.84e-15 ***
## rad           0.299608   0.063402   4.726 3.00e-06 ***
## tax          -0.011778   0.003372  -3.493 0.000521 ***
## ptratio      -0.946525   0.129066  -7.334 9.24e-13 ***
## black         0.009291   0.002674   3.475 0.000557 ***
## lstat        -0.522553   0.047424 -11.019  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.736 on 494 degrees of freedom
## Multiple R-squared:  0.7406, Adjusted R-squared:  0.7348 
## F-statistic: 128.2 on 11 and 494 DF,  p-value: < 2.2e-16

Set plot configuration

par(mfrow=c(2,2))
plot(fit4)

Uses coefplot to plot coefficients. Note the line at 0.

par(mfrow=c(1,1))
arm::coefplot(fit4)

### Nonlinear Terms and Interactions

fit5 <- lm(medv~lstat*age,Boston)
summary(fit5)
## 
## Call:
## lm(formula = medv ~ lstat * age, data = Boston)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.806  -4.045  -1.333   2.085  27.552 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 36.0885359  1.4698355  24.553  < 2e-16 ***
## lstat       -1.3921168  0.1674555  -8.313 8.78e-16 ***
## age         -0.0007209  0.0198792  -0.036   0.9711    
## lstat:age    0.0041560  0.0018518   2.244   0.0252 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.149 on 502 degrees of freedom
## Multiple R-squared:  0.5557, Adjusted R-squared:  0.5531 
## F-statistic: 209.3 on 3 and 502 DF,  p-value: < 2.2e-16

I() identity function for squared term to interpret as-is Combine two command lines with semicolon

fit6 <- lm(medv~lstat +I(lstat^2),Boston); summary(fit6)
## 
## Call:
## lm(formula = medv ~ lstat + I(lstat^2), data = Boston)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.2834  -3.8313  -0.5295   2.3095  25.4148 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 42.862007   0.872084   49.15   <2e-16 ***
## lstat       -2.332821   0.123803  -18.84   <2e-16 ***
## I(lstat^2)   0.043547   0.003745   11.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.524 on 503 degrees of freedom
## Multiple R-squared:  0.6407, Adjusted R-squared:  0.6393 
## F-statistic: 448.5 on 2 and 503 DF,  p-value: < 2.2e-16
par(mfrow=c(1,1))
plot(medv~lstat)
points(lstat,fitted(fit6),col="red",pch=20)
fit7 <- lm(medv~poly(lstat,4))
points(lstat,fitted(fit7),col="blue",pch=20)

### Qualitative Predictors

names(Carseats)
##  [1] "Sales"       "CompPrice"   "Income"      "Advertising" "Population" 
##  [6] "Price"       "ShelveLoc"   "Age"         "Education"   "Urban"      
## [11] "US"
summary(Carseats)
##      Sales          CompPrice       Income        Advertising    
##  Min.   : 0.000   Min.   : 77   Min.   : 21.00   Min.   : 0.000  
##  1st Qu.: 5.390   1st Qu.:115   1st Qu.: 42.75   1st Qu.: 0.000  
##  Median : 7.490   Median :125   Median : 69.00   Median : 5.000  
##  Mean   : 7.496   Mean   :125   Mean   : 68.66   Mean   : 6.635  
##  3rd Qu.: 9.320   3rd Qu.:135   3rd Qu.: 91.00   3rd Qu.:12.000  
##  Max.   :16.270   Max.   :175   Max.   :120.00   Max.   :29.000  
##    Population        Price        ShelveLoc        Age          Education   
##  Min.   : 10.0   Min.   : 24.0   Bad   : 96   Min.   :25.00   Min.   :10.0  
##  1st Qu.:139.0   1st Qu.:100.0   Good  : 85   1st Qu.:39.75   1st Qu.:12.0  
##  Median :272.0   Median :117.0   Medium:219   Median :54.50   Median :14.0  
##  Mean   :264.8   Mean   :115.8                Mean   :53.32   Mean   :13.9  
##  3rd Qu.:398.5   3rd Qu.:131.0                3rd Qu.:66.00   3rd Qu.:16.0  
##  Max.   :509.0   Max.   :191.0                Max.   :80.00   Max.   :18.0  
##  Urban       US     
##  No :118   No :142  
##  Yes:282   Yes:258  
##                     
##                     
##                     
## 
fit1 <- lm(Sales~.+Income:Advertising+Age:Price,Carseats) # add two interaction terms
summary(fit1)
## 
## Call:
## lm(formula = Sales ~ . + Income:Advertising + Age:Price, data = Carseats)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.9208 -0.7503  0.0177  0.6754  3.3413 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.5755654  1.0087470   6.519 2.22e-10 ***
## CompPrice           0.0929371  0.0041183  22.567  < 2e-16 ***
## Income              0.0108940  0.0026044   4.183 3.57e-05 ***
## Advertising         0.0702462  0.0226091   3.107 0.002030 ** 
## Population          0.0001592  0.0003679   0.433 0.665330    
## Price              -0.1008064  0.0074399 -13.549  < 2e-16 ***
## ShelveLocGood       4.8486762  0.1528378  31.724  < 2e-16 ***
## ShelveLocMedium     1.9532620  0.1257682  15.531  < 2e-16 ***
## Age                -0.0579466  0.0159506  -3.633 0.000318 ***
## Education          -0.0208525  0.0196131  -1.063 0.288361    
## UrbanYes            0.1401597  0.1124019   1.247 0.213171    
## USYes              -0.1575571  0.1489234  -1.058 0.290729    
## Income:Advertising  0.0007510  0.0002784   2.698 0.007290 ** 
## Price:Age           0.0001068  0.0001333   0.801 0.423812    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.011 on 386 degrees of freedom
## Multiple R-squared:  0.8761, Adjusted R-squared:  0.8719 
## F-statistic:   210 on 13 and 386 DF,  p-value: < 2.2e-16
attach(Carseats)
contrasts(Carseats$ShelveLoc)
##        Good Medium
## Bad       0      0
## Good      1      0
## Medium    0      1

Writing R functions

Combine the lm, plot and abline functions to create a one step regression fit plot function

regplot=function(x,y){
  fit=lm(y~x)
  plot(x,y)
  abline(fit,col="red")
}
attach(Carseats)
## The following objects are masked from Carseats (pos = 3):
## 
##     Advertising, Age, CompPrice, Education, Income, Population, Price,
##     Sales, ShelveLoc, Urban, US
regplot(Price,Sales)

# allow extra room for additional arguments
regplot=function(x,y,...){
  fit=lm(y~x)
  plot(x,y,...)
  abline(fit,col="red")
}
regplot(Price,Sales,xlab="Price",ylab="Sales",col="blue",pch=20)

## Additional note: try out the coefplot2 package to finetune the coefplots

library(coefplot2)
## Loading required package: coda
## 
## Attaching package: 'coda'
## The following object is masked from 'package:arm':
## 
##     traceplot
coefplot2(fit3)

# Assignment 6 Use the TEDS2016 dataset to run a logit (logistic regression) model using female as sole predictor. The dependent variable is the vote (1-0) for Tsai Ing-wen, the female candidate for the then opposition party Democratic Progressive Party (DPP). Access the data set using the following codes:

setwd("~/Desktop/Spring 2022/Knowledge Mining")
library(haven)
TEDS_2016 <- read_dta("TEDS_2016.dta")

Are female voters more likely to vote for President Tsai? Why or why not? The coefficient for female is -0.06517 but it is not significant at any level. This suggests that there is no significant difference in the voting patterns of males and females in regard to votes for President Tsai. If anything, the coefficient suggests that females are less likely to vote for President Tsai.

glm.vt <- glm(votetsai~female, data=TEDS_2016,family=binomial)
summary(glm.vt)
## 
## Call:
## glm(formula = votetsai ~ female, family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4180  -1.3889   0.9546   0.9797   0.9797  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.54971    0.08245   6.667 2.61e-11 ***
## female      -0.06517    0.11644  -0.560    0.576    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1666.5  on 1260  degrees of freedom
## Residual deviance: 1666.2  on 1259  degrees of freedom
##   (429 observations deleted due to missingness)
## AIC: 1670.2
## 
## Number of Fisher Scoring iterations: 4

Add party ID variables (KMT, DPP) and other demographic variables (age, edu, income) to improve the model. What do you find? Which group of variables work better in explaining/predicting votetsai These variables appear to predict votetsai better based on the AIC which is 850 in this model compared with 1670 in the other model.

glm.vt2 <- glm(votetsai~female + KMT + DPP + age + edu + income, data=TEDS_2016,family=binomial)
summary(glm.vt2)
## 
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + age + edu + income, 
##     family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7360  -0.3673   0.2408   0.2946   2.5408  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.618640   0.592084   2.734  0.00626 ** 
## female       0.047406   0.177403   0.267  0.78930    
## KMT         -3.156273   0.250360 -12.607  < 2e-16 ***
## DPP          2.888943   0.267968  10.781  < 2e-16 ***
## age         -0.011808   0.007164  -1.648  0.09931 .  
## edu         -0.184604   0.083102  -2.221  0.02632 *  
## income       0.013727   0.034382   0.399  0.68971    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1661.76  on 1256  degrees of freedom
## Residual deviance:  836.15  on 1250  degrees of freedom
##   (433 observations deleted due to missingness)
## AIC: 850.15
## 
## Number of Fisher Scoring iterations: 6
names(glm.vt2)
##  [1] "coefficients"      "residuals"         "fitted.values"    
##  [4] "effects"           "R"                 "rank"             
##  [7] "qr"                "family"            "linear.predictors"
## [10] "deviance"          "aic"               "null.deviance"    
## [13] "iter"              "weights"           "prior.weights"    
## [16] "df.residual"       "df.null"           "y"                
## [19] "converged"         "boundary"          "model"            
## [22] "na.action"         "call"              "formula"          
## [25] "terms"             "data"              "offset"           
## [28] "control"           "method"            "contrasts"        
## [31] "xlevels"

Try adding the following variables: Independence – Supporting Taiwan’s Independence (vs. Unification with China) Econ_worse – Evaluations of economy (Negative) Govt_dont_care – Political Efficacy (Government does not care about people) Minnan_father – Descendent of local Taiwanese Mainland_father – Descendent of mainland China (migrated from mainland circa or after 1949) Taiwanese – Self-identified Taiwanese The model with all variables has an AIC of 793 which is the smallest AIC of all three, suggesting that this is the best model for predicting votetsai.

glm.vt3 <- glm(votetsai~female + KMT + DPP + age + edu + income + Independence + Econ_worse + Govt_dont_care + Minnan_father + Mainland_father + Taiwanese, data=TEDS_2016,family=binomial)
summary(glm.vt3)
## 
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + age + edu + income + 
##     Independence + Econ_worse + Govt_dont_care + Minnan_father + 
##     Mainland_father + Taiwanese, family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1060  -0.3143   0.1744   0.3975   2.7917  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -0.015976   0.679780  -0.024  0.98125    
## female          -0.097996   0.189840  -0.516  0.60571    
## KMT             -2.922246   0.259333 -11.268  < 2e-16 ***
## DPP              2.468855   0.275350   8.966  < 2e-16 ***
## age              0.003287   0.007884   0.417  0.67672    
## edu             -0.092110   0.090119  -1.022  0.30674    
## income           0.021771   0.036406   0.598  0.54984    
## Independence     1.020953   0.251776   4.055 5.01e-05 ***
## Econ_worse       0.310462   0.189100   1.642  0.10063    
## Govt_dont_care  -0.014295   0.188765  -0.076  0.93964    
## Minnan_father   -0.247650   0.253921  -0.975  0.32941    
## Mainland_father -1.089332   0.396822  -2.745  0.00605 ** 
## Taiwanese        0.909019   0.198930   4.570 4.89e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1661.76  on 1256  degrees of freedom
## Residual deviance:  767.13  on 1244  degrees of freedom
##   (433 observations deleted due to missingness)
## AIC: 793.13
## 
## Number of Fisher Scoring iterations: 6