# lda example
x = seq(-4,4,l=1000)
f1 = dnorm(x,-1.5,1)
f2 = dnorm(x,1.5,1)
par(mfrow=c(2,2))
# plot 1
pi1 = .5
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
# draw 20 points from each class density (matching the means above) so the
# histogram below actually shows two groups; label each draw with its group
set.seed(101)
a1 = rnorm(20, -1.5, 1)
a2 = rnorm(20, 1.5, 1)
data <- data.frame(x = c(a1, a2),
                   group = rep(c("a1", "a2"), each = 20))
# install.packages("ggplot2")
library("ggplot2")
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
abline(v=0, col="blue")
ggplot(data, aes(x = x, fill = group)) +
geom_histogram(position = "identity", alpha = 0.2, bins = 10) +
geom_vline(xintercept=0, color="blue")
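# a quick numeric check (not in the original): with equal priors and equal
# variances, the posteriors p1 and p2 computed above cross at the midpoint
# of the two means, which is the blue line at 0
x[which.min(abs(p1 - p2))]  # approximately 0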
# commands used in Lecture for Ch 4. Note that these are NOT the commands
# used in section 4.6 of ISLR (the lab).
library(ISLR)
head(Default)
## default student balance income
## 1 No No 729.5265 44361.625
## 2 No Yes 817.1804 12106.135
## 3 No No 1073.5492 31767.139
## 4 No No 529.2506 35704.494
## 5 No No 785.6559 38463.496
## 6 No Yes 919.5885 7491.559
table(Default$default) # how many observations are there in each class?
##
## No Yes
## 9667 333
# below we create a subsample of points, all with default='yes', and
# an equal number (333) with default='no'
set.seed(101)
mysubset = which(Default$default=='Yes')
mysubset = c(mysubset,sample(which(Default$default=='No'),333))
table(Default$default[mysubset]) # Did we get the subsampling right?
##
## No Yes
## 333 333
#win.graph(1)
#par(mfrow=c(1,3))
plot(Default$balance[mysubset], Default$income[mysubset],
col=Default$default[mysubset],pch=19)
#win.graph(2)
boxplot(Default$balance ~ Default$default, col = 2:3, pch = 19) # one color per class
#win.graph(3)
boxplot(Default$income ~ Default$default, col = 2:3, pch = 19)
# compare a linear model and a logistic regression:
lm1 = lm(I(default=='Yes')~balance,data=Default)
logistic1 = glm(default~balance,data=Default,family=binomial)
logistic1
##
## Call: glm(formula = default ~ balance, family = binomial, data = Default)
##
## Coefficients:
## (Intercept) balance
## -10.651331 0.005499
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9998 Residual
## Null Deviance: 2921
## Residual Deviance: 1596 AIC: 1600
plot(c(0,2500),c(0,1),type='n',xlab='balance',ylab='Default (1=yes)')
points(Default$balance,predict(lm1),col='red')                          # linear fit
points(Default$balance,predict(logistic1,type='response'),col='blue')  # logistic fit
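# a quick check (not in the original code): the linear model's fitted values
# drop below 0 for small balances, while the logistic fit stays inside (0, 1)
range(predict(lm1))
range(predict(logistic1, type = "response"))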
# plot a logistic function:
beta = cbind(c(-2,0,2),c(1,2,.5))
x = seq(-4,4,l=500)
plot(range(x), c(0,1), type='n', xlab='x', ylab='p(x)')
for (i in 1:nrow(beta)){
  numerator = exp(beta[i,1]+beta[i,2]*x)
  p = numerator/(1+numerator)
  lines(x, p, col=i)  # one curve per (beta0, beta1) pair
}
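# the same curves via R's built-in logistic CDF: plogis(q) = exp(q)/(1+exp(q)),
# so the manual numerator/(1+numerator) computation above is equivalent
p_alt = plogis(beta[1,1] + beta[1,2]*x)
all.equal(p_alt, exp(beta[1,1]+beta[1,2]*x)/(1+exp(beta[1,1]+beta[1,2]*x)))
## [1] TRUE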
###############################################
# lda example
x = seq(-4,4,l=1000)
f1 = dnorm(x,-2,1)
f2 = dnorm(x,1,1)
par(mfrow=c(2,2))
# plot 1
pi1 = .5
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
plot(x,p1,type='l',ylab='p(Y=1|x)',main=mymain) # posterior probability of class 1
# plot 2
pi1 = .8
pi2 = 1-pi1
p1 = f1*pi1
p2 = f2*pi2
tp = p1+p2
p1 = p1/tp
p2 = p2/tp
mymain = paste("pi1 = ",pi1, ", pi2 = ",pi2)
matplot(x,cbind(f1*pi1,f2*pi2),type='l',ylab='density*pi',
main=mymain)
plot(x,p1,type='l',ylab='p(Y=1|x)',main=mymain)
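# the crossing point of the two posteriors has a closed form: with a shared
# variance sigma^2, pi1*f1(x) = pi2*f2(x) at
#   x* = (mu1 + mu2)/2 + sigma^2 * log(pi2/pi1) / (mu1 - mu2)
# a minimal sketch, using the unit-variance densities defined above
bayes_boundary <- function(mu1, mu2, sigma, pi1) {
  (mu1 + mu2)/2 + sigma^2 * log((1 - pi1)/pi1) / (mu1 - mu2)
}
bayes_boundary(-2, 1, 1, 0.5)  # -0.5: the midpoint of the means
bayes_boundary(-2, 1, 1, 0.8)  # about -0.04: boundary shifts toward class 2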
library(MASS)
lda1 = lda(default~balance+student,data=Default)
lda1
## Call:
## lda(default ~ balance + student, data = Default)
##
## Prior probabilities of groups:
## No Yes
## 0.9667 0.0333
##
## Group means:
## balance studentYes
## No 803.9438 0.2914037
## Yes 1747.8217 0.3813814
##
## Coefficients of linear discriminants:
## LD1
## balance 0.002244397
## studentYes -0.249059498
logit1 = glm(default~balance+student,data=Default,family=binomial)
# see Ch4_Lab.R for detailed codes on prediction
# Refer to Ch4_LogisticReg_EX8_1_STAT3160.R
win.graph() # new plot window (Windows-only device; use dev.new() on other platforms)
plot(lda1)
lda.pred=predict(lda1, Default)
lda.class=lda.pred$class
table(lda.class,Default$default)
##
## lda.class No Yes
## No 9644 252
## Yes 23 81
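# overall training error rate of the LDA fit
mean(lda.class != Default$default)  # (23 + 252) / 10000 = 0.0275
# a hedged sketch (the 0.2 cutoff is illustrative, not computed above):
# lowering the posterior threshold for 'Yes' flags more of the true
# defaulters at the cost of more false alarms
lda.pred.yes <- ifelse(lda.pred$posterior[, "Yes"] > 0.2, "Yes", "No")
table(lda.pred.yes, Default$default)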
## Create a new variable mpg01: 1 if mpg > median(mpg), 0 otherwise
attach(Auto)
## The following object is masked from package:ggplot2:
##
## mpg
mpg01 <- rep(0, length(mpg))
mpg01[mpg > median(mpg)] <- 1
Auto <- data.frame(Auto, mpg01)
head(Auto)
## mpg cylinders displacement horsepower weight acceleration year origin
## 1 18 8 307 130 3504 12.0 70 1
## 2 15 8 350 165 3693 11.5 70 1
## 3 18 8 318 150 3436 11.0 70 1
## 4 16 8 304 150 3433 12.0 70 1
## 5 17 8 302 140 3449 10.5 70 1
## 6 15 8 429 198 4341 10.0 70 1
## name mpg01
## 1 chevrolet chevelle malibu 0
## 2 buick skylark 320 0
## 3 plymouth satellite 0
## 4 amc rebel sst 0
## 5 ford torino 0
## 6 ford galaxie 500 0
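# for reference, the two-step construction of mpg01 above is equivalent to
# this one-liner (it recomputes the same column):
Auto$mpg01 <- as.numeric(Auto$mpg > median(Auto$mpg))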
## investigate the association between mpg01 and the other features
cor(Auto[, -9]) # drop the non-numeric 'name' column (column 9)
## mpg cylinders displacement horsepower weight
## mpg 1.0000000 -0.7776175 -0.8051269 -0.7784268 -0.8322442
## cylinders -0.7776175 1.0000000 0.9508233 0.8429834 0.8975273
## displacement -0.8051269 0.9508233 1.0000000 0.8972570 0.9329944
## horsepower -0.7784268 0.8429834 0.8972570 1.0000000 0.8645377
## weight -0.8322442 0.8975273 0.9329944 0.8645377 1.0000000
## acceleration 0.4233285 -0.5046834 -0.5438005 -0.6891955 -0.4168392
## year 0.5805410 -0.3456474 -0.3698552 -0.4163615 -0.3091199
## origin 0.5652088 -0.5689316 -0.6145351 -0.4551715 -0.5850054
## mpg01 0.8369392 -0.7591939 -0.7534766 -0.6670526 -0.7577566
## acceleration year origin mpg01
## mpg 0.4233285 0.5805410 0.5652088 0.8369392
## cylinders -0.5046834 -0.3456474 -0.5689316 -0.7591939
## displacement -0.5438005 -0.3698552 -0.6145351 -0.7534766
## horsepower -0.6891955 -0.4163615 -0.4551715 -0.6670526
## weight -0.4168392 -0.3091199 -0.5850054 -0.7577566
## acceleration 1.0000000 0.2903161 0.2127458 0.3468215
## year 0.2903161 1.0000000 0.1815277 0.4299042
## origin 0.2127458 0.1815277 1.0000000 0.5136984
## mpg01 0.3468215 0.4299042 0.5136984 1.0000000
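# a quick ranking (a small convenience, not in the original): absolute
# correlation of each numeric feature with mpg01; cylinders, weight,
# displacement, and horsepower top the list (besides mpg itself)
sort(abs(cor(Auto[, -9])[, "mpg01"]), decreasing = TRUE)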
pairs(Auto)
# "Cylinders vs mpg01"
boxplot(cylinders ~ mpg01, data = Auto)
# "Displacement vs mpg01"
boxplot(displacement ~ mpg01, data = Auto)
# "Horsepower vs mpg01"
boxplot(horsepower ~ mpg01, data = Auto)
# "Weight vs mpg01"
boxplot(weight ~ mpg01, data = Auto)
# "Acceleration vs mpg01"
boxplot(acceleration ~ mpg01, data = Auto)
# "Year vs mpg01"
boxplot(year ~ mpg01, data = Auto)
## (c) Split the data into a training set and a test set.
train <- (year %% 2 == 0) # even model years go to the training set
Auto.train <- Auto[train, ]
Auto.test <- Auto[!train, ]
mpg01.test <- mpg01[!train]
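# sanity check on the split: 210 even-year rows for training, 182 for testing
c(train = sum(train), test = sum(!train))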
## (d) Perform LDA on the training data in order to predict mpg01 using the
## variables that seemed most associated with mpg01 in (b). What is the test
## error of the model obtained?
fit.lda <- lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train) # avoid masking MASS::lda
fit.lda
## Call:
## lda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.812500 3604.823 271.7396 133.14583
## 1 4.070175 2314.763 111.6623 77.92105
##
## Coefficients of linear discriminants:
## LD1
## cylinders -0.6741402638
## weight -0.0011465750
## displacement 0.0004481325
## horsepower 0.0059035377
## Prediction of LDA
pred.lda <- predict(fit.lda, Auto.test)
table(pred.lda$class, mpg01.test)
## mpg01.test
## 0 1
## 0 86 9
## 1 14 73
## LDA test error rate
mean(pred.lda$class != mpg01.test)
## [1] 0.1263736
## QDA
fit.qda <- qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, subset = train) # avoid masking MASS::qda
fit.qda
## Call:
## qda(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto,
## subset = train)
##
## Prior probabilities of groups:
## 0 1
## 0.4571429 0.5428571
##
## Group means:
## cylinders weight displacement horsepower
## 0 6.812500 3604.823 271.7396 133.14583
## 1 4.070175 2314.763 111.6623 77.92105
## Prediction of QDA
pred.qda <- predict(fit.qda, Auto.test)
table(pred.qda$class, mpg01.test)
## mpg01.test
## 0 1
## 0 89 13
## 1 11 69
## QDA test error rate
mean(pred.qda$class != mpg01.test)
## [1] 0.1318681
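# QDA's test error (13.2%) is slightly above LDA's (12.6%) on this split, so
# the extra flexibility of class-specific covariances does not pay off here.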
## Logistic Regression
fit1<- glm(mpg01 ~ cylinders + weight + displacement + horsepower, data = Auto, family = binomial, subset = train)
summary(fit1)
##
## Call:
## glm(formula = mpg01 ~ cylinders + weight + displacement + horsepower,
## family = binomial, data = Auto, subset = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48027 -0.03413 0.10583 0.29634 2.57584
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 17.658730 3.409012 5.180 2.22e-07 ***
## cylinders -1.028032 0.653607 -1.573 0.1158
## weight -0.002922 0.001137 -2.569 0.0102 *
## displacement 0.002462 0.015030 0.164 0.8699
## horsepower -0.050611 0.025209 -2.008 0.0447 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 289.58 on 209 degrees of freedom
## Residual deviance: 83.24 on 205 degrees of freedom
## AIC: 93.24
##
## Number of Fisher Scoring iterations: 7
## Prediction of Logistic Regression on test data
probs <- predict(fit1, Auto.test, type = "response")
pred.glm <- rep(0, length(probs))
pred.glm[probs > 0.5] <- 1
table(pred.glm, mpg01.test)
## mpg01.test
## pred.glm 0 1
## 0 89 11
## 1 11 71
## Logistic regression test error rate
mean(pred.glm != mpg01.test)
## [1] 0.1208791
## KNN
# install.packages("class")
library(class)
train.X <- cbind(cylinders, weight, displacement, horsepower)[train, ]  # predictor matrix, training rows
test.X <- cbind(cylinders, weight, displacement, horsepower)[!train, ] # predictor matrix, test rows
train.mpg01 <- mpg01[train]
set.seed(1)
pred.knn <- knn(train.X, test.X, train.mpg01, k = 1)
table(pred.knn, mpg01.test)
## mpg01.test
## pred.knn 0 1
## 0 83 11
## 1 17 71
## KNN test error rate (k = 1)
mean(pred.knn != mpg01.test)
## [1] 0.1538462
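# a hedged sketch (not in the original): repeat KNN for several values of k
# and compare test error rates. Scaling the predictors first (e.g. with
# scale()) often helps KNN, since weight and displacement otherwise dominate
# the Euclidean distance.
ks <- c(1, 3, 5, 10, 25, 50, 100)
errs <- sapply(ks, function(k) {
  set.seed(1)
  mean(knn(train.X, test.X, train.mpg01, k = k) != mpg01.test)
})
data.frame(k = ks, test.error = errs)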
Power <- function() {
2^3
}
Power()
## [1] 8
Power2 <- function(x, a) {
x^a
}
Power2(5, 3)
## [1] 125
Power2(10, 3)
## [1] 1000
Power2(8, 17)
## [1] 2.2518e+15
Power2(131, 3)
## [1] 2248091
Power3 <- function(x , a) {
result <- x^a
return(result)
}
x <- 1:10
plot(x, Power3(x, 2), log = "xy", xlab = "x (log scale)", ylab = "x^2 (log scale)", main = "x^2 vs x on log-log axes")
## (f)
PlotPower <- function(x, a) {
plot(x, Power3(x, a))
}
PlotPower(1:10, 3)
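# a small extension (an assumption, not part of the exercise): forward extra
# plot arguments through ... so the same helper can draw log-log axes
PlotPower2 <- function(x, a, ...) {
  plot(x, Power3(x, a), ...)
}
PlotPower2(1:10, 3, log = "xy")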