Regression Week 4: Ridge Regression Assignment 1
1. Loading data sets and packages
library(ggplot2);library(gridExtra);library(plyr)
library(MASS);library(knitr)
setwd('F:/Machine Learning washington/WEEK4/project1')
dat=read.csv("kc_house_data.csv")
train=read.csv("wk3_kc_house_train_data.csv")
test=read.csv("wk3_kc_house_test_data.csv")
valid=read.csv("wk3_kc_house_valid_data.csv")
shuffle=read.csv("wk3_kc_house_train_valid_shuffled.csv")
set1=read.csv("wk3_kc_house_set_1_data.csv")
set2=read.csv("wk3_kc_house_set_2_data.csv")
set3=read.csv("wk3_kc_house_set_3_data.csv")
set4=read.csv("wk3_kc_house_set_4_data.csv")
2. Define functions**
# Define a function to add features
polyfeature <- function(degree,feature){
# degree is the max power; feature is a column in a dataframe.
if(floor(degree)!=degree){
print("The degree you input is not a qualified integer!???T_T")
}
else {
array = as.data.frame( matrix(0,length(feature),degree) )
for(i in c(1:degree)){
array[,i]= feature^i
colnames(array)[i]=paste('power',i)
}
}
return(array)
}
# Define a function to normalize features
normal <- function(feature){
max=max(feature)
min=min(feature)
feature=(feature-min)/(max-min)
}
3. Scatter plot: price vs sqft(the dat)
dat=arrange(dat,sqft_living)
pic=ggplot(dat,aes(sqft_living,price))+geom_point(pch=5,col='lightblue')
pic=pic+ggtitle('Housing prices vs square feet')+xlab('square feet')
pic

4. Add penalty to 15th polynomial model
# Total cost=RSS + L2*T(W)*W
# TINU REGULARIZATION
L2 = 1.5e-5
X=as.matrix( polyfeature(15,dat$sqft_living) )
model=lm.ridge(dat$price~X,lambda = L2)
round(coef(model)[2],2)
## Xpower 1
## 136.66
5. Biulding 15th polynomial model among set1-4
# Generally, whenever we see weights change so much in response to change in data, we believe the variance of our estimate to be large.
# Comparison between small and large penalty
# Biulding 15th polynomial model among set1-4
for(L in c(1e-9,1.23e2)){
set1=arrange(set1,sqft_living)
set2=arrange(set2,sqft_living)
set3=arrange(set3,sqft_living)
set4=arrange(set4,sqft_living)
X1=as.matrix(polyfeature(15,set1$sqft_living))
X2=as.matrix(polyfeature(15,set2$sqft_living))
X3=as.matrix(polyfeature(15,set3$sqft_living))
X4=as.matrix(polyfeature(15,set4$sqft_living))
fit1=lm.ridge(set1$price~X1,lambda = L)
fit2=lm.ridge(set2$price~X2,lambda = L)
fit3=lm.ridge(set3$price~X3,lambda = L)
fit4=lm.ridge(set4$price~X4,lambda = L)
Yhat1=cbind(1,X1) %*% coef(fit1) ; Yhat2=cbind(1,X2) %*% coef(fit2)
Yhat3=cbind(1,X3) %*% coef(fit3) ; Yhat4=cbind(1,X4) %*% coef(fit4)
g1=ggplot(set1)+geom_point(aes(sqft_living,set1$price),color='lightblue',pch=5)
g1=g1+geom_point(aes(set1$sqft_living,Yhat1),color='pink',pch=3)
g1=g1+ggtitle("Subset 1")+ylab(paste('penalty',L))
g2=ggplot(set2)+geom_point(aes(sqft_living,set2$price),color='lightblue',pch=5)
g2=g2+geom_point(aes(X2[,1],Yhat2),color='pink',pch=3)
g2=g2+ggtitle("Subset 2")
g3=ggplot(set3)+geom_point(aes(sqft_living,set3$price),color='lightblue',pch=5)
g3=g3+geom_point(aes(X3[,1],Yhat3),color='pink',pch=3)
g3=g3+ggtitle("Subset 3")
g4=ggplot(set4)+geom_point(aes(sqft_living,set4$price),color='lightblue',pch=5)
g4=g4+geom_point(aes(X4[,1],Yhat4),color='pink',pch=3)
g4=g4+ggtitle("Subet 4")
t=round( data.frame(coef(fit1)[2],coef(fit2)[2], coef(fit3)[2],coef(fit4)[2]),2)
rownames(t)='Coefficients of SquareFeet'
colnames(t)=c('fit1','fit2','fit3','fit4')
grid.arrange(g1,g2,g3,g4,ncol=4)
table=data.frame(Lamda=format(L,scientific=T),min=min(t),max=max(t))
print(kable(t))
print(kable(table))
}

##
##
## fit1 fit2 fit3 fit4
## --------------------------- ------- --------- -------- --------
## Coefficients of SquareFeet 797.27 -4235.44 2944.53 -766.61
##
##
## Lamda min max
## ------ --------- --------
## 1e-09 -4235.44 2944.53

##
##
## fit1 fit2 fit3 fit4
## --------------------------- ------- ------ ------ ------
## Coefficients of SquareFeet 111.68 76.95 89.92 79.66
##
##
## Lamda min max
## --------- ------ -------
## 1.23e+02 76.95 111.68
6. How to select a penalty?
# Step1 K-fold segments
# segment i starts at index (n*i/k) and ends at (n*(i+1)/k)-1.
n=dim(shuffle)[1]
k=10
kfold= matrix(0, 3,k)
for (i in c(1:k)){
start=floor(n*(i-1)/k+1)
end=floor(n*(i)/k)
kfold[,i]=rbind(i,start,end)
}
kfold
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1 2 3 4 5 6 7 8 9 10
## [2,] 1 1940 3880 5819 7759 9699 11638 13578 15517 17457
## [3,] 1939 3879 5818 7758 9698 11637 13577 15516 17456 19396
# Step2 Cross validation among K FOLDS
RSS=matrix(0,1,k)
exp=seq(3,9,0.5)
Lamda=10^exp
CVrss=matrix(0,1,length(Lamda))
for( j in c(1:length(Lamda)) ){
L=Lamda[j]
for (i in c(1:k)){
valid=shuffle[c(kfold[2,i]:kfold[3,i]),]
valid=arrange(valid,sqft_living)
train=shuffle[-c(kfold[2,i]:kfold[3,i]),]
train=arrange(train,sqft_living)
X=as.matrix( polyfeature(15,train$sqft_living) )
Y=train$price ; fit=lm.ridge(Y~X,lambda = L)
Xvalid= as.matrix( polyfeature(15,valid$sqft_living) )
Yhat=cbind(1,Xvalid) %*% coef(fit)
RSS[i]=sum((valid$price-Yhat)^2)
}
CVrss[,j]=mean(RSS)
}
CVrss=format(CVrss)
which(CVrss==min(CVrss))
## [1] 1
primeL=Lamda[which(CVrss==min(CVrss))]
primeL
## [1] 1000
7. Use the lamda to train our training set
train=arrange(train,sqft_living)
x=train$sqft_living
X=as.matrix( polyfeature(15,x) )
Y=train$price
fit=lm.ridge(Y~X,lambda = primeL)
Xtest=as.matrix( polyfeature(15,test$sqft_living))
Yhat=cbind(1,Xtest) %*% coef(fit)
RSS=sum((test$price-Yhat)^2)
RSS
## [1] 1.371019e+14