Submission info -->

Homework #2

Ben Peloquin

SUNetID: bpeloqui

Exercises submitted:

Applied:

Unsupervised learning section 10.7 #10

Cross Validation section 5.4 #8
library(ISLR)
#Applied ---------------------------------->
#Section 5.4 

#Q8)
#simulated data-set

#a)--------------->
set.seed(1)
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
# n is 100 (the number of observations)
# p is 1, the number of predictors (x)
# model is:
# y = -2*(x^2) + x + e
# e = rnorm(100)

#b)--------------->
plot(x, y, main="Scatterplot x against y",
     xlab="x", ylab="y", col="blue", pch=10)

# this plot makes sense because we have 
# a quadratic relationship with a negative coefficient on x^2 (an inverted parabola)
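# As an illustrative check (an addition, not part of the exercise answer), we can
# overlay the noiseless generating curve y = x - 2x^2 on the scatterplot; the
# points should scatter around it with roughly unit-variance noise.
curve(x - 2*x^2, add=TRUE, lwd=2, col="red")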

#c)--------------->
library(boot)
#random seed 10
set.seed(10)

#data container for x, y
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
df = data.frame(x, y)
cv.error = rep(0, 5)
for (i in 1:5) {
  glm.fit=glm(y~poly(x, i), data=df)
  cv.error[i] = cv.glm(df, glm.fit)$delta[1]
}
print("random seed 10: ")
## [1] "random seed 10: "
cv.error
## [1] 5.1701184 0.9728969 0.9828663 1.1683921 1.0216840
#d)--------------->
#random seed 11
set.seed(11)
y = rnorm(100)
x = rnorm(100)
y = x-(2*(x^2)) + rnorm(100)
df = data.frame(x, y)
cv.error = rep(0, 5)
for (i in 1:5) {
  glm.fit=glm(y~poly(x, i), data=df)
  cv.error[i] = cv.glm(df, glm.fit)$delta[1]
}
print("random seed 11: ")
## [1] "random seed 11: "
cv.error
## [1] 9.1498243 0.9556705 0.9908358 0.9838681 0.9943875
# The error estimates differ slightly because we are training on different data sets (the data change when we change set.seed())
# However the finding remains - it appears that a quadratic model gives us the greatest 
# improvement (higher-order polynomials aren't warranted here). Of course the generating function is quadratic, so no surprise

#e)--------------->
# the smallest LOOCV error with set.seed(10) and set.seed(11) comes from the quadratic model, which makes sense given that the generating function is quadratic
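# Quick programmatic check (illustrative): which.min() picks out the polynomial
# degree with the lowest LOOCV estimate, which is 2 for both seeds.
which.min(cv.error)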

for (i in 1:4) {
  lm.n = lm(y~poly(x, i), data=df)
  print(summary(lm.n))
}
## 
## Call:
## lm(formula = y ~ poly(x, i), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5690  -1.2253   0.8467   1.7532   3.5854 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.8736     0.2904  -6.451 4.23e-09 ***
## poly(x, i)    5.2469     2.9042   1.807   0.0739 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.904 on 98 degrees of freedom
## Multiple R-squared:  0.03223,    Adjusted R-squared:  0.02236 
## F-statistic: 3.264 on 1 and 98 DF,  p-value: 0.07388
## 
## 
## Call:
## lm(formula = y ~ poly(x, i), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.06943 -0.67500 -0.00682  0.68415  2.68220 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.87359    0.09645  -19.43   <2e-16 ***
## poly(x, i)1   5.24695    0.96455    5.44    4e-07 ***
## poly(x, i)2 -27.13553    0.96455  -28.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9645 on 97 degrees of freedom
## Multiple R-squared:  0.8943, Adjusted R-squared:  0.8922 
## F-statistic: 410.5 on 2 and 97 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = y ~ poly(x, i), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.05122 -0.67462 -0.01225  0.67659  2.66276 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.87359    0.09693 -19.329  < 2e-16 ***
## poly(x, i)1   5.24695    0.96934   5.413 4.56e-07 ***
## poly(x, i)2 -27.13553    0.96934 -27.994  < 2e-16 ***
## poly(x, i)3  -0.20272    0.96934  -0.209    0.835    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9693 on 96 degrees of freedom
## Multiple R-squared:  0.8944, Adjusted R-squared:  0.8911 
## F-statistic:   271 on 3 and 96 DF,  p-value: < 2.2e-16
## 
## 
## Call:
## lm(formula = y ~ poly(x, i), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.02238 -0.65812 -0.01323  0.66987  2.70371 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1.87359    0.09626 -19.464  < 2e-16 ***
## poly(x, i)1   5.24695    0.96257   5.451 3.95e-07 ***
## poly(x, i)2 -27.13553    0.96257 -28.191  < 2e-16 ***
## poly(x, i)3  -0.20272    0.96257  -0.211    0.834    
## poly(x, i)4   1.47721    0.96257   1.535    0.128    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9626 on 95 degrees of freedom
## Multiple R-squared:  0.8969, Adjusted R-squared:  0.8926 
## F-statistic: 206.7 on 4 and 95 DF,  p-value: < 2.2e-16
# it's clear that the coefficients for increasingly higher-order regressors become statistically insignificant (significance stops at degree 2). These results are aligned with our CV findings
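# As an illustrative follow-up (lm.4 is a new name, not part of the original
# answer), the same conclusion can be read off the p-value column of the
# degree-4 fit: only the linear and quadratic terms fall below 0.05.
lm.4 = lm(y~poly(x, 4), data=df)
coef(summary(lm.4))[, "Pr(>|t|)"]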



#Section 10.7

#A) Generate a data set with 3 classes, 50 variables, 60 observations --->
set.seed(100)
d=matrix(rnorm(20*3*50), ncol=50)
d[1:20, 1:50]=d[1:20, 1:50]-2
d[21:40, 1:50]=d[21:40, 1:50]+1
d[41:60, 1:50]=d[41:60, 1:50]+4

#B)PCA---------------->
pr.out=prcomp(d, scale=T)
biplot(pr.out, scale=0)

#C)K-means---------------->
km.out=kmeans(d, 3, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering
     results with K=3", xlab="", ylab="", pch=20, cex=2)

table(km.out$cluster)
## 
##  1  2  3 
## 20 20 20
#I had trouble figuring out how to compare the cluster assignments to the true class labels using table() -- but when I simply aggregate across the three clusters it's reassuring that K-means sorts the observations into buckets of size n=20
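# One possible sketch for that comparison: the rows were generated in blocks of
# 20, so the true class labels can be rebuilt as rep(1:3, each=20) and
# cross-tabulated against the cluster assignments. Cluster numbering is
# arbitrary, so a perfect result is one non-zero cell per row and column rather
# than a clean diagonal.
true.labels = rep(1:3, each=20)
table(km.out$cluster, true.labels)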

km.out=kmeans(d, 2, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering
     results with K=2", xlab="", ylab="", pch=20, cex=2)

#This clustering has maintained one of the clusters while combining the other two. This is a (relatively) good result because even though our K is off by one we're still identifying one of the clusters.
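# The same cross-tabulation (reusing true.labels from above) makes the merge
# described above explicit for the K=2 fit.
table(km.out$cluster, true.labels)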

km.out=kmeans(d, 4, nstart=10)
plot(d, col=(km.out$cluster+1), main="K-Means Clustering
     results with K=4", xlab="", ylab="", pch=20, cex=2)

#Clearly we've overextended the method: the bottom corner suggests there are two clusters where there clearly appears to be one.

#F) K-means on first two principal component score vectors------>
km.out=kmeans(pr.out$x[, 1:2], 3, nstart=50)
plot(pr.out$x[, 1:2], col=(km.out$cluster+1), main="K-Means Clustering
     results with K=3 on first two PC score vectors", xlab="", ylab="", pch=20, cex=2)

d.scaled = scale(d)
km.out=kmeans(d.scaled, 3, nstart=50)
plot(d.scaled, col=(km.out$cluster+1), main="K-Means Clustering on scaled data
     results with K=3", xlab="", ylab="", pch=20, cex=2)

#These results are fairly similar: after scaling the data we can still clearly delineate the three clusters, much as we could after performing PCA and plotting the first two principal component score vectors
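# Illustrative check (again reusing true.labels): cross-tabulating the
# scaled-data clustering against the generated classes should show essentially
# the same clean separation as the PCA-based version.
table(km.out$cluster, true.labels)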

#Pre work -->
#Cross-validation----------------------->
set.seed(2)
train=sample(392, 196)
length(train)
## [1] 196
#linear model
lm.fit=lm(mpg~horsepower, data=Auto, subset=train)
summary(lm.fit)
## 
## Call:
## lm(formula = mpg ~ horsepower, data = Auto, subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.9927  -3.3274  -0.3231   2.8258  16.5115 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 40.263828   0.982438   40.98   <2e-16 ***
## horsepower  -0.156543   0.008742  -17.91   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.028 on 194 degrees of freedom
## Multiple R-squared:  0.6231, Adjusted R-squared:  0.6211 
## F-statistic: 320.7 on 1 and 194 DF,  p-value: < 2.2e-16
attach(Auto)
mean((mpg-predict(lm.fit, Auto))[-train]^2)
## [1] 23.29559
#quadratic
lm.fit2 = lm(mpg~poly(horsepower, 2), data=Auto, subset=train)
summary(lm.fit2)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 2), data = Auto, subset = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -15.018  -2.211   0.025   2.292  14.492 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.663      0.317  74.639  < 2e-16 ***
## poly(horsepower, 2)1 -122.624      5.860 -20.927  < 2e-16 ***
## poly(horsepower, 2)2   43.380      5.666   7.657 8.95e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.415 on 193 degrees of freedom
## Multiple R-squared:  0.7109, Adjusted R-squared:  0.7079 
## F-statistic: 237.3 on 2 and 193 DF,  p-value: < 2.2e-16
mean((mpg-predict(lm.fit2, Auto))[-train]^2)
## [1] 18.90124
#cubic
lm.fit3 = lm(mpg~poly(horsepower, 3), data=Auto, subset=train)
summary(lm.fit3)
## 
## Call:
## lm(formula = mpg ~ poly(horsepower, 3), data = Auto, subset = train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -15.0509  -2.1848   0.0193   2.4984  14.5908 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            23.6703     0.3175  74.544  < 2e-16 ***
## poly(horsepower, 3)1 -123.2397     5.9217 -20.811  < 2e-16 ***
## poly(horsepower, 3)2   42.9374     5.7018   7.531 1.92e-12 ***
## poly(horsepower, 3)3    4.3282     5.6923   0.760    0.448    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.42 on 192 degrees of freedom
## Multiple R-squared:  0.7118, Adjusted R-squared:  0.7073 
## F-statistic:   158 on 3 and 192 DF,  p-value: < 2.2e-16
mean((mpg-predict(lm.fit3, Auto))[-train]^2)
## [1] 19.2574
#LOOCV----------------------->
glm.fit=glm(mpg~horsepower, data=Auto)
coef(glm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
lm.fit=lm(mpg~horsepower, data=Auto)
coef(lm.fit)
## (Intercept)  horsepower 
##  39.9358610  -0.1578447
library(boot)
glm.fit=glm(mpg~horsepower, data=Auto)
cv.err = cv.glm(Auto, glm.fit)
cv.err$delta
## [1] 24.23151 24.23114
cv.error = rep(0, 5)
for (i in 1:5) {
  glm.fit=glm(mpg~poly(horsepower, i), data=Auto)
  cv.error[i] = cv.glm(Auto, glm.fit)$delta[1]
}
cv.error
## [1] 24.23151 19.24821 19.33498 19.42443 19.03321
#k-Fold----------------------->
set.seed(17)
cv.error.10=rep(0, 10)
for (i in 1:10) {
  glm.fit=glm(mpg~poly(horsepower, i), data=Auto)
  cv.error.10[i]=cv.glm(Auto, glm.fit, K=10)$delta[1]
}
cv.error.10
##  [1] 24.20520 19.18924 19.30662 19.33799 18.87911 19.02103 18.89609
##  [8] 19.71201 18.95140 19.50196
#Pre-work 10.4 p.401
states=row.names(USArrests)
states
##  [1] "Alabama"        "Alaska"         "Arizona"        "Arkansas"      
##  [5] "California"     "Colorado"       "Connecticut"    "Delaware"      
##  [9] "Florida"        "Georgia"        "Hawaii"         "Idaho"         
## [13] "Illinois"       "Indiana"        "Iowa"           "Kansas"        
## [17] "Kentucky"       "Louisiana"      "Maine"          "Maryland"      
## [21] "Massachusetts"  "Michigan"       "Minnesota"      "Mississippi"   
## [25] "Missouri"       "Montana"        "Nebraska"       "Nevada"        
## [29] "New Hampshire"  "New Jersey"     "New Mexico"     "New York"      
## [33] "North Carolina" "North Dakota"   "Ohio"           "Oklahoma"      
## [37] "Oregon"         "Pennsylvania"   "Rhode Island"   "South Carolina"
## [41] "South Dakota"   "Tennessee"      "Texas"          "Utah"          
## [45] "Vermont"        "Virginia"       "Washington"     "West Virginia" 
## [49] "Wisconsin"      "Wyoming"
names(USArrests)
## [1] "Murder"   "Assault"  "UrbanPop" "Rape"
apply(USArrests, 2, mean)
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
apply(USArrests, 2, var)
##     Murder    Assault   UrbanPop       Rape 
##   18.97047 6945.16571  209.51878   87.72916
pr.out=prcomp(USArrests, scale=T)
names(pr.out)
## [1] "sdev"     "rotation" "center"   "scale"    "x"
pr.out$center
##   Murder  Assault UrbanPop     Rape 
##    7.788  170.760   65.540   21.232
pr.out$scale
##    Murder   Assault  UrbanPop      Rape 
##  4.355510 83.337661 14.474763  9.366385
pr.out$rotation 
##                 PC1        PC2        PC3         PC4
## Murder   -0.5358995  0.4181809 -0.3412327  0.64922780
## Assault  -0.5831836  0.1879856 -0.2681484 -0.74340748
## UrbanPop -0.2781909 -0.8728062 -0.3780158  0.13387773
## Rape     -0.5434321 -0.1673186  0.8177779  0.08902432
dim(pr.out$x)
## [1] 50  4
biplot(pr.out, scale=0)

pr.out$sdev
## [1] 1.5748783 0.9948694 0.5971291 0.4164494
pr.var=pr.out$sdev^2
pr.var
## [1] 2.4802416 0.9897652 0.3565632 0.1734301
pve=pr.var/sum(pr.var)
pve
## [1] 0.62006039 0.24744129 0.08914080 0.04335752
plot(pve, xlab="Principal Component",
     ylab="Proportion of Variance Explained", 
     ylim=c(0,1), type='b')

plot(cumsum(pve), xlab="Principal Component",
     ylab="Cumulative Prop Var explained",
     ylim=c(0,1), type='b')

#Pre-work p. 406
set.seed(2)
x=matrix(rnorm(50*2), ncol=2)
x[1:25,1]=x[1:25,1]+3
x[1:25,2]=x[1:25,2]-4

ncol(x)
## [1] 2
?kmeans
km.out=kmeans(x, 2, nstart=20)
km.out$cluster
##  [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
plot(x, col=(km.out$cluster+1), main="K-Means Clustering
     results with K=2", xlab="", ylab="", pch=20, cex=2)

set.seed(4)
km.out=kmeans(x, 3, nstart=20)
km.out
## K-means clustering with 3 clusters of sizes 10, 23, 17
## 
## Cluster means:
##         [,1]        [,2]
## 1  2.3001545 -2.69622023
## 2 -0.3820397 -0.08740753
## 3  3.7789567 -4.56200798
## 
## Clustering vector:
##  [1] 3 1 3 1 3 3 3 1 3 1 3 1 3 1 3 1 3 3 3 3 3 1 3 3 3 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 1 2 1 2 2 2 2
## 
## Within cluster sum of squares by cluster:
## [1] 19.56137 52.67700 25.74089
##  (between_SS / total_SS =  79.3 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"
plot(x, col=(km.out$cluster+1), main="K-Means Clustering
     results with K=3", xlab="", ylab="", pch=20, cex=2)