# lda example
x = seq(-4,4,l=1000)
f1 = dnorm(x,-1.5,1)
f2 = dnorm(x,1.5,1)
par(mfrow=c(2,2))
# plot 1
pi1 = .5
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
# draw 20 points from each class density (matching the means above) so the
# histogram below actually shows two groups; label each draw with its group
set.seed(101)
a1 = rnorm(20, -1.5, 1)
a2 = rnorm(20, 1.5, 1)
data <- data.frame(x = c(a1, a2),
                   group = rep(c("a1", "a2"), each = 20))
# install.packages("ggplot2")
library("ggplot2")
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
abline(v=0, col="blue")
ggplot(data, aes(x = x, fill = group)) +
geom_histogram(position = "identity", alpha = 0.2, bins = 10) +
geom_vline(xintercept=0, color="blue")
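# a quick numeric check (not in the original): with equal priors and equal
# variances, the posteriors p1 and p2 computed above cross at the midpoint
# of the two means, which is the blue line at 0
x[which.min(abs(p1 - p2))]  # approximately 0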
# commands used in Lecture for Ch 4. Note that these are NOT the commands
# used in section 4.6 of ISLR (the lab).
library(ISLR)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
table(Default$default) # how many observations are there in each class?
##
## No Yes
## 9667 333
# below we create a subsample of points, all with default='yes', and
# an equal number (333) with default='no'
set.seed(101)
mysubset = which(Default$default=='Yes')
mysubset = c(mysubset,sample(which(Default$default=='No'),333))
table(Default$default[mysubset]) # Did we get the subsampling right?
##
## No Yes
## 333 333
#win.graph(1)
#par(mfrow=c(1,3))
plot(Default$balance[mysubset], Default$income[mysubset],
col=Default$default[mysubset],pch=19)
#win.graph(2)
boxplot(Default$balance ~ Default$default, col = 2:3, pch = 19) # one color per class
#win.graph(3)
boxplot(Default$income ~ Default$default, col = 2:3, pch = 19)
# compare a linear model and a logistic regression:
lm1 = lm(I(default=='Yes')~balance,data=Default)
logistic1 = glm(default~balance,data=Default,family=binomial)
logistic1
##
## Call: glm(formula = default ~ balance, family = binomial, data = Default)
##
## Coefficients:
## (Intercept) balance
## -10.651331 0.005499
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9998 Residual
## Null Deviance: 2921
## Residual Deviance: 1596 AIC: 1600
plot(c(0,2500),c(0,1),type='n',xlab='balance',ylab='Default (1=yes)')
points(Default$balance,predict(lm1),col='red')                          # linear fit
points(Default$balance,predict(logistic1,type='response'),col='blue')  # logistic fit
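# a quick check (not in the original code): the linear model's fitted values
# drop below 0 for small balances, while the logistic fit stays inside (0, 1)
range(predict(lm1))
range(predict(logistic1, type = "response"))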
# plot a logistic function:
beta = cbind(c(-2,0,2),c(1,2,.5))
x = seq(-4,4,l=500)
plot(range(x), c(0,1), type='n', xlab='x', ylab='p(x)')
for (i in 1:nrow(beta)){
  numerator = exp(beta[i,1]+beta[i,2]*x)
  p = numerator/(1+numerator)
  lines(x, p, col=i)  # one curve per (beta0, beta1) pair
}
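# the same curves via R's built-in logistic CDF: plogis(q) = exp(q)/(1+exp(q)),
# so the manual numerator/(1+numerator) computation above is equivalent
p_alt = plogis(beta[1,1] + beta[1,2]*x)
all.equal(p_alt, exp(beta[1,1]+beta[1,2]*x)/(1+exp(beta[1,1]+beta[1,2]*x)))
## [1] TRUE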
###############################################
# lda example
x = seq(-4,4,l=1000)
f1 = dnorm(x,-2,1)
f2 = dnorm(x,1,1)
par(mfrow=c(2,2))
# plot 1
pi1 = .5
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
plot(x,p1,type='l',ylab='p(Y=1|x)',main=mymain) # posterior probability of class 1
# plot 2
pi1 = .8
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
plot(x,p1,type='l',ylab='p(Y=1|x)',main=mymain)
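# the crossing point of the two posteriors has a closed form: with a shared
# variance sigma^2, pi1*f1(x) = pi2*f2(x) at
#   x* = (mu1 + mu2)/2 + sigma^2 * log(pi2/pi1) / (mu1 - mu2)
# a minimal sketch, using the unit-variance densities defined above
bayes_boundary <- function(mu1, mu2, sigma, pi1) {
  (mu1 + mu2)/2 + sigma^2 * log((1 - pi1)/pi1) / (mu1 - mu2)
}
bayes_boundary(-2, 1, 1, 0.5)  # -0.5: the midpoint of the means
bayes_boundary(-2, 1, 1, 0.8)  # about -0.04: boundary shifts toward class 2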
library(MASS)
lda1 = lda(default~balance+student,data=Default)
lda1
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
logit1 = glm(default~balance+student,data=Default,family=binomial)
# see Ch4_Lab.R for detailed codes on prediction
# Refer to Ch4_LogisticReg_EX8_1_STAT3160.R
win.graph() # new plot window (Windows-only device; use dev.new() on other platforms)
plot(lda1)
lda.pred=predict(lda1, Default)
lda.class=lda.pred$class
table(lda.class,Default$default)
##
## lda.class No Yes
## No 9644 252
## Yes 23 81
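# overall training error rate of the LDA fit
mean(lda.class != Default$default)  # (23 + 252) / 10000 = 0.0275
# a hedged sketch (the 0.2 cutoff is illustrative, not computed above):
# lowering the posterior threshold for 'Yes' flags more of the true
# defaulters at the cost of more false alarms
lda.pred.yes <- ifelse(lda.pred$posterior[, "Yes"] > 0.2, "Yes", "No")
table(lda.pred.yes, Default$default)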
## Create a new variable mpg01: 1 if mpg > median(mpg), 0 otherwise
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
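# for reference, the two-step construction of mpg01 above is equivalent to
# this one-liner (it recomputes the same column):
Auto$mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))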
## investigate the association between mpg01 and the other features
cor(Auto[, -9]) # drop the non-numeric 'name' column (column 9)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
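# a quick ranking (a small convenience, not in the original): absolute
# correlation of each numeric feature with mpg01; cylinders, weight,
# displacement, and horsepower top the list (besides mpg itself)
sort(abs(cor(Auto[, -9])[, "mpg01"]), decreasing = TRUE)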
pairs(Auto)
# "Cylinders vs mpg01"
boxplot(cylinders ~ mpg01, data = Auto)
# "Displacement vs mpg01"
boxplot(displacement ~ mpg01, data = Auto)
# "Horsepower vs mpg01"
boxplot(horsepower ~ mpg01, data = Auto)
# "Weight vs mpg01"
boxplot(weight ~ mpg01, data = Auto)
# "Acceleration vs mpg01"
boxplot(acceleration ~ mpg01, data = Auto)
# "Year vs mpg01"
boxplot(year ~ mpg01, data = Auto)
## (c) Split the data into a training set and a test set.
train <- (year %% 2 == 0) # even model years go to the training set
Auto.train <- Auto[train, ]
Auto.test <- Auto[!train, ]
mpg01.test <- mpg01[!train]
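# sanity check on the split: 210 even-year rows for training, 182 for testing
c(train = sum(train), test = sum(!train))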
## (d) Perform LDA on the training data in order to predict mpg01 using the
## variables that seemed most associated with mpg01 in (b). What is the test
## error of the model obtained?
fit.lda <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train) # avoid masking MASS::lda
fit.lda
## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.812500 3604.823 271.7396 133.14583
## 1 4.070175 2314.763 111.6623 77.92105
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.6741402638
## weight -0.0011465750
## displacement 0.0004481325
## horsepower 0.0059035377
## Prediction of LDA
pred.lda <- predict(fit.lda, Auto.test)
table(pred.lda$class, mpg01.test)
## mpg01.test
## 0 1
## 0 86 9
## 1 14 73
## LDA test error rate
mean(pred.lda$class != mpg01.test)
## [1] 0.1263736
## QDA
fit.qda <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train) # avoid masking MASS::qda
fit.qda
## Call:
## qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.812500 3604.823 271.7396 133.14583
## 1 4.070175 2314.763 111.6623 77.92105
## Prediction of QDA
pred.qda <- predict(fit.qda, Auto.test)
table(pred.qda$class, mpg01.test)
## mpg01.test
## 0 1
## 0 89 13
## 1 11 69
## QDA test error rate
mean(pred.qda$class != mpg01.test)
## [1] 0.1318681
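# QDA's test error (13.2%) is slightly above LDA's (12.6%) on this split, so
# the extra flexibility of class-specific covariances does not pay off here.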
## Logistic Regression
fit1<- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, family = binomial, subset = train)
summary(fit1)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = Auto, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48027 -0.03413 0.10583 0.29634 2.57584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.658730 3.409012 5.180 2.22e-07 ***
## cylinders -1.028032 0.653607 -1.573 0.1158
## weight -0.002922 0.001137 -2.569 0.0102 *
## displacement 0.002462 0.015030 0.164 0.8699
## horsepower -0.050611 0.025209 -2.008 0.0447 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.58 on 209 degrees of freedom
## Residual deviance: 83.24 on 205 degrees of freedom
## AIC: 93.24
##
## Number of Fisher Scoring iterations: 7
## Prediction of Logistic Regression on test data
probs <- predict(fit1, Auto.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, mpg01.test)
## mpg01.test
## pred.glm 0 1
## 0 89 11
## 1 11 71
## Logistic regression test error rate
mean(pred.glm != mpg01.test)
## [1] 0.1208791
## KNN
# install.packages("class")
library(class)
train.X <- cbind(cylinders, weight, displacement, horsepower)[train, ]  # predictor matrix, training rows
test.X <- cbind(cylinders, weight, displacement, horsepower)[!train, ] # predictor matrix, test rows
train.mpg01 <- mpg01[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.mpg01, k = 1)
table(pred.knn, mpg01.test)
## mpg01.test
## pred.knn 0 1
## 0 83 11
## 1 17 71
## KNN test error rate (k = 1)
mean(pred.knn != mpg01.test)
## [1] 0.1538462
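# a hedged sketch (not in the original): repeat KNN for several values of k
# and compare test error rates. Scaling the predictors first (e.g. with
# scale()) often helps KNN, since weight and displacement otherwise dominate
# the Euclidean distance.
ks <- c(1, 3, 5, 10, 25, 50, 100)
errs <- sapply(ks, function(k) {
  set.seed(1)
  mean(knn(train.X, test.X, train.mpg01, k = k) != mpg01.test)
})
data.frame(k = ks, test.error = errs)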
Power <- function() {
2^3
}
Power()
## [1] 8
Power2 <- function(x, a) {
x^a
}
Power2(5, 3)
## [1] 125
Power2(10, 3)
## [1] 1000
Power2(8, 17)
## [1] 2.2518e+15
Power2(131, 3)
## [1] 2248091
Power3 <- function(x , a) {
result <- x^a
return(result)
}
x <- 1:10
plot(x, Power3(x, 2), log = "xy", xlab = "x (log scale)", ylab = "x^2 (log scale)", main = "x^2 vs x on log-log axes")
## (f)
PlotPower <- function(x, a) {
plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
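# a small extension (an assumption, not part of the exercise): forward extra
# plot arguments through ... so the same helper can draw log-log axes
PlotPower2 <- function(x, a, ...) {
  plot(x, Power3(x, a), ...)
}
PlotPower2(1:10, 3, log = "xy")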