Submission info --->
Homework #2
Ben Peloquin
SUNetID: bpeloqui
Exercises submitted:
Applied:
Unsupervised learning section 10.7 #10
Cross Validation section 5.4 #8
library(ISLR)
#Applied ---------------------------------->
#Section 5.4
#Q8)
#simulated data-set
#a)--------------->
set.seed(1)
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
# n is 100 (the number of observations)
# p is 1, the number of predictors (x)
# model is:
# y = -2*(x^2) + x + e
# e = rnorm(100)
#b)--------------->
plot(x, y, main="Scatterplot x against y",
xlab="x", ylab="y", col="blue", pch=10)
# This plot makes sense: the generating function is quadratic with a
# negative coefficient on x^2, so the points trace an inverted parabola
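# Added sketch (not part of the original answer): overlaying the true mean
# function f(x) = x - 2x^2 on the scatterplot makes the inverted-parabola
# shape explicit.
curve(x - 2*x^2, from=min(x), to=max(x), add=TRUE, col="red", lwd=2)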
#c)--------------->
library(boot)
#random seed 10
set.seed(10)
#data container for x, y
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
df = data.frame(x, y)
cv.error = rep(0, 5)
for (i in 1:5) {
glm.fit=glm(y~poly(x, i), data=df)
cv.error[i] = cv.glm(df, glm.fit)$delta[1]
}
print("random seed 10: ")
## [1] "random seed 10: "
cv.error
## [1] 5.1701184 0.9728969 0.9828663 1.1683921 1.0216840
#d)--------------->
#random seed 11
set.seed(11)
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
df = data.frame(x, y)
cv.error = rep(0, 5)
for (i in 1:5) {
glm.fit=glm(y~poly(x, i), data=df)
cv.error[i] = cv.glm(df, glm.fit)$delta[1]
}
print("random seed 11: ")
## [1] "random seed 11: "
cv.error
## [1] 9.1498243 0.9556705 0.9908358 0.9838681 0.9943875
# The error estimates differ slightly because we are training on different data sets (the data change when we change set.seed())
# However the finding remains: the quadratic model gives us the greatest
# improvement (higher-order polynomials aren't warranted here). Since the generating function is quadratic, this is no surprise
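# Quick check (added): the degree that minimizes the LOOCV estimate --
# degree 2 for both seeds shown above.
which.min(cv.error)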
#e)--------------->
# The smallest error under both set.seed(10) and set.seed(11) comes from the quadratic model, which makes sense given that the generating function is quadratic
for (i in 1:4) {
lm.n = lm(y~poly(x, i), data=df)
print(summary(lm.n))
}
##
## Call:
## lm(formula = y ~ poly(x, i), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5690 -1.2253 0.8467 1.7532 3.5854
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.8736 0.2904 -6.451 4.23e-09 ***
## poly(x, i) 5.2469 2.9042 1.807 0.0739 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.904 on 98 degrees of freedom
## Multiple R-squared: 0.03223, Adjusted R-squared: 0.02236
## F-statistic: 3.264 on 1 and 98 DF, p-value: 0.07388
##
##
## Call:
## lm(formula = y ~ poly(x, i), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.06943 -0.67500 -0.00682 0.68415 2.68220
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.87359 0.09645 -19.43 <2e-16 ***
## poly(x, i)1 5.24695 0.96455 5.44 4e-07 ***
## poly(x, i)2 -27.13553 0.96455 -28.13 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9645 on 97 degrees of freedom
## Multiple R-squared: 0.8943, Adjusted R-squared: 0.8922
## F-statistic: 410.5 on 2 and 97 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = y ~ poly(x, i), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.05122 -0.67462 -0.01225 0.67659 2.66276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.87359 0.09693 -19.329 < 2e-16 ***
## poly(x, i)1 5.24695 0.96934 5.413 4.56e-07 ***
## poly(x, i)2 -27.13553 0.96934 -27.994 < 2e-16 ***
## poly(x, i)3 -0.20272 0.96934 -0.209 0.835
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9693 on 96 degrees of freedom
## Multiple R-squared: 0.8944, Adjusted R-squared: 0.8911
## F-statistic: 271 on 3 and 96 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = y ~ poly(x, i), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.02238 -0.65812 -0.01323 0.66987 2.70371
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.87359 0.09626 -19.464 < 2e-16 ***
## poly(x, i)1 5.24695 0.96257 5.451 3.95e-07 ***
## poly(x, i)2 -27.13553 0.96257 -28.191 < 2e-16 ***
## poly(x, i)3 -0.20272 0.96257 -0.211 0.834
## poly(x, i)4 1.47721 0.96257 1.535 0.128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9626 on 95 degrees of freedom
## Multiple R-squared: 0.8969, Adjusted R-squared: 0.8926
## F-statistic: 206.7 on 4 and 95 DF, p-value: < 2.2e-16
# The coefficients for regressors above degree 2 are statistically insignificant (significance stops at the quadratic term). These results are consistent with our CV findings
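# A programmatic check (added sketch): pull the p-values directly from the
# degree-4 fit; only the intercept, linear, and quadratic terms fall below
# 0.05, matching the summaries printed above.
coef(summary(lm(y ~ poly(x, 4), data=df)))[, "Pr(>|t|)"]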
#Section 10.7
#A) Generated data set 3 classes, 50 variables, 60 observations --->
set.seed(100)
d=matrix(rnorm(20*3*50), ncol=50)
d[1:20, 1:50]=d[1:20, 1:50]-2
d[21:40, 1:50]=d[21:40, 1:50]+1
d[41:60, 1:50]=d[41:60, 1:50]+4
#B)PCA---------------->
pr.out=prcomp(d, scale=T)
biplot(pr.out, scale=0)
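# Added sketch for part (b): the exercise also asks for a plot of the first
# two principal component score vectors with a distinct color per class.
# Assuming the classes follow the row blocks from part (a) (rows 1-20, 21-40, 41-60):
true.labels = rep(1:3, each=20)
plot(pr.out$x[, 1:2], col=true.labels+1, pch=19,
     xlab="PC1", ylab="PC2", main="First Two PC Score Vectors by Class")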
#C)K-means---------------->
km.out=kmeans(d, 3, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3",
     xlab="", ylab="", pch=20, cex=2)
table(km.out$cluster)
##
## 1 2 3
## 20 20 20
# I wasn't sure how to extract the true labels with table() alone; simply tabulating the three clusters is at least reassuring, since kmeans sorts the observations into buckets of size n=20
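# One way to do this (added sketch): cross-tabulate the cluster assignments
# against the true class labels (true.labels, defined above). Perfect recovery
# means each cluster lines up with exactly one class of 20 observations.
table(km.out$cluster, true.labels)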
#D)K-means with K=2---------------->
km.out=kmeans(d, 2, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2",
     xlab="", ylab="", pch=20, cex=2)
# This clustering keeps one of the true clusters intact while merging the other two. That is a (relatively) good result: even though our K is off by one, we still identify one of the clusters correctly.
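# The same cross-tabulation (added) makes the merge explicit: one K=2 cluster
# should match a single class, while the other absorbs the remaining two.
table(km.out$cluster, true.labels)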
#E)K-means with K=4---------------->
km.out=kmeans(d, 4, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering Results with K=4",
     xlab="", ylab="", pch=20, cex=2)
# Here we have overextended the method: the bottom corner of the plot is split into two clusters where there clearly appears to be only one.
#F)K-means on first two principal component score vectors------>
km.out=kmeans(pr.out$x[,1:2], 3, nstart=50)
plot(pr.out$x[,1:2], col=(km.out$cluster+1), main="K-Means Clustering on PC Scores, K=3",
     xlab="PC1", ylab="PC2", pch=20, cex=2)
#G)K-means on scaled data---------------->
d.scaled = scale(d)
km.out=kmeans(d.scaled, 3, nstart=50)
plot(pr.out$x[,1:2], col=(km.out$cluster+1), main="K-Means Clustering on Scaled Data, K=3",
     xlab="PC1", ylab="PC2", pch=20, cex=2)
# These results are fairly similar to (b): after scaling the data we can still clearly delineate the three clusters, much as we could after performing PCA and plotting the first two principal component score vectors
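# Added check (not in the original submission): cross-tabulate the K=3
# assignments on scaled vs. unscaled data. Since the class means are well
# separated, the two clusterings should agree up to a relabeling of the
# cluster numbers.
km.unscaled = kmeans(d, 3, nstart=50)
table(scaled=km.out$cluster, unscaled=km.unscaled$cluster)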
#Pre work -->
#Cross-validation----------------------->
set.seed(2)
train=sample(392, 196)
length(train)
## [1] 196
#linear model
lm.fit=lm(mpg~horsepower, data=Auto, subset=train)
summary(lm.fit)
##
## Call:
## lm(formula = mpg ~ horsepower, data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.9927 -3.3274 -0.3231 2.8258 16.5115
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.263828 0.982438 40.98 <2e-16 ***
## horsepower -0.156543 0.008742 -17.91 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.028 on 194 degrees of freedom
## Multiple R-squared: 0.6231, Adjusted R-squared: 0.6211
## F-statistic: 320.7 on 1 and 194 DF, p-value: < 2.2e-16
attach(Auto)
mean((mpg-predict(lm.fit, Auto))[-train]^2)
## [1] 23.29559
#quadratic
lm.fit2 = lm(mpg~poly(horsepower, 2), data=Auto, subset=train)
summary(lm.fit2)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.018 -2.211 0.025 2.292 14.492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.663 0.317 74.639 < 2e-16 ***
## poly(horsepower, 2)1 -122.624 5.860 -20.927 < 2e-16 ***
## poly(horsepower, 2)2 43.380 5.666 7.657 8.95e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.415 on 193 degrees of freedom
## Multiple R-squared: 0.7109, Adjusted R-squared: 0.7079
## F-statistic: 237.3 on 2 and 193 DF, p-value: < 2.2e-16
mean((mpg-predict(lm.fit2, Auto))[-train]^2)
## [1] 18.90124
#cubic
lm.fit3 = lm(mpg~poly(horsepower, 3), data=Auto, subset=train)
summary(lm.fit3)
##
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.0509 -2.1848 0.0193 2.4984 14.5908
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.6703 0.3175 74.544 < 2e-16 ***
## poly(horsepower, 3)1 -123.2397 5.9217 -20.811 < 2e-16 ***
## poly(horsepower, 3)2 42.9374 5.7018 7.531 1.92e-12 ***
## poly(horsepower, 3)3 4.3282 5.6923 0.760 0.448
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.42 on 192 degrees of freedom
## Multiple R-squared: 0.7118, Adjusted R-squared: 0.7073
## F-statistic: 158 on 3 and 192 DF, p-value: < 2.2e-16
mean((mpg-predict(lm.fit3, Auto))[-train]^2)
## [1] 19.2574
#LOOCV----------------------->
glm.fit=glm(mpg~horsepower, data=Auto)
coef(glm.fit)
## (Intercept) horsepower
## 39.9358610 -0.1578447
lm.fit=lm(mpg~horsepower, data=Auto)
coef(lm.fit)
## (Intercept) horsepower
## 39.9358610 -0.1578447
library(boot)
glm.fit=glm(mpg~horsepower, data=Auto)
cv.err = cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
cv.error = rep(0, 5)
for (i in 1:5) {
glm.fit=glm(mpg~poly(horsepower, i), data=Auto)
cv.error[i] = cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
#k-Fold----------------------->
set.seed(17)
cv.error.10=rep(0, 10)
for (i in 1:10) {
glm.fit=glm(mpg~poly(horsepower, i), data=Auto)
cv.error.10[i]=cv.glm(Auto, glm.fit, K=10)$delta[1]
}
cv.error.10
## [1] 24.20520 19.18924 19.30662 19.33799 18.87911 19.02103 18.89609
## [8] 19.71201 18.95140 19.50196
#Pre-work 10.4 p.401
states=row.names(USArrests)
states
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
names(USArrests)
## [1] "Murder" "Assault" "UrbanPop" "Rape"
apply(USArrests, 2, mean)
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
apply(USArrests, 2, var)
## Murder Assault UrbanPop Rape
## 18.97047 6945.16571 209.51878 87.72916
pr.out=prcomp(USArrests, scale=T)
names(pr.out)
## [1] "sdev" "rotation" "center" "scale" "x"
pr.out$center
## Murder Assault UrbanPop Rape
## 7.788 170.760 65.540 21.232
pr.out$scale
## Murder Assault UrbanPop Rape
## 4.355510 83.337661 14.474763 9.366385
pr.out$rotation
## PC1 PC2 PC3 PC4
## Murder -0.5358995 0.4181809 -0.3412327 0.64922780
## Assault -0.5831836 0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158 0.13387773
## Rape -0.5434321 -0.1673186 0.8177779 0.08902432
dim(pr.out$x)
## [1] 50 4
biplot(pr.out, scale=0)
pr.out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr.var=pr.out$sdev^2
pr.var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve=pr.var/sum(pr.var)
pve
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
plot(pve, xlab="Principal Component",
     ylab="Prop. Variance Explained",
     ylim=c(0,1), type='b')
plot(cumsum(pve), xlab="Principal Component",
     ylab="Cumulative Prop. Variance Explained",
     ylim=c(0,1), type='b')
#Pre-work p. 406
set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4
ncol(x)
## [1] 2
?kmeans
km.out=kmeans(x, 2, nstart=20)
km.out$cluster
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=2",
     xlab="", ylab="", pch=20, cex=2)
set.seed(4)
km.out=kmeans(x, 3, nstart=20)
km.out
## K-means clustering with 3 clusters of sizes 10, 23, 17
##
## Cluster means:
## [,1] [,2]
## 1 2.3001545 -2.69622023
## 2 -0.3820397 -0.08740753
## 3 3.7789567 -4.56200798
##
## Clustering vector:
## [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
##
## Within cluster sum of squares by cluster:
## [1] 19.56137 52.67700 25.74089
## (between_SS / total_SS = 79.3 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
plot(x, col=(km.out$cluster+1), main="K-Means Clustering Results with K=3",
     xlab="", ylab="", pch=20, cex=2)