Regression Week 4: Ridge Regression Assignment 1

1. Loading data sets and packages

library(ggplot2);library(gridExtra);library(plyr)
library(MASS);library(knitr)
setwd('F:/Machine Learning washington/WEEK4/project1')
dat=read.csv("kc_house_data.csv")
train=read.csv("wk3_kc_house_train_data.csv")
test=read.csv("wk3_kc_house_test_data.csv")
valid=read.csv("wk3_kc_house_valid_data.csv")
shuffle=read.csv("wk3_kc_house_train_valid_shuffled.csv")
set1=read.csv("wk3_kc_house_set_1_data.csv")
set2=read.csv("wk3_kc_house_set_2_data.csv")
set3=read.csv("wk3_kc_house_set_3_data.csv")
set4=read.csv("wk3_kc_house_set_4_data.csv")

2. Define functions**

# Define a function to add features
polyfeature <- function(degree,feature){
# degree is the max power; feature is a column in a dataframe.
  if(floor(degree)!=degree){
     print("The degree you input is not a qualified integer!???T_T")
  }
  else {
       array = as.data.frame(   matrix(0,length(feature),degree)   )
       for(i in c(1:degree)){
           array[,i]= feature^i
           colnames(array)[i]=paste('power',i)
       } 
       }
  return(array)
}
# Define a function to normalize features
normal <- function(feature){
max=max(feature)
min=min(feature)
feature=(feature-min)/(max-min)
}

3. Scatter plot: price vs sqft(the dat)

dat=arrange(dat,sqft_living)
pic=ggplot(dat,aes(sqft_living,price))+geom_point(pch=5,col='lightblue')
pic=pic+ggtitle('Housing prices vs square feet')+xlab('square feet')
pic

4. Add penalty to 15th polynomial model

#  Total cost=RSS + L2*T(W)*W
# TINU REGULARIZATION
L2 = 1.5e-5
X=as.matrix( polyfeature(15,dat$sqft_living) )
model=lm.ridge(dat$price~X,lambda = L2)
round(coef(model)[2],2)
## Xpower 1 
##   136.66

5. Biulding 15th polynomial model among set1-4

# Generally, whenever we see weights change so much in response to change in data, we believe the variance of our estimate to be large.
# Comparison between small and large penalty
# Biulding 15th polynomial model among set1-4
for(L in c(1e-9,1.23e2)){
set1=arrange(set1,sqft_living)
set2=arrange(set2,sqft_living)
set3=arrange(set3,sqft_living)
set4=arrange(set4,sqft_living)
X1=as.matrix(polyfeature(15,set1$sqft_living))
X2=as.matrix(polyfeature(15,set2$sqft_living))
X3=as.matrix(polyfeature(15,set3$sqft_living))
X4=as.matrix(polyfeature(15,set4$sqft_living))
fit1=lm.ridge(set1$price~X1,lambda = L)
fit2=lm.ridge(set2$price~X2,lambda = L)
fit3=lm.ridge(set3$price~X3,lambda = L)
fit4=lm.ridge(set4$price~X4,lambda = L)
Yhat1=cbind(1,X1) %*% coef(fit1)  ;  Yhat2=cbind(1,X2) %*% coef(fit2)
Yhat3=cbind(1,X3) %*% coef(fit3)  ;  Yhat4=cbind(1,X4) %*% coef(fit4)

g1=ggplot(set1)+geom_point(aes(sqft_living,set1$price),color='lightblue',pch=5)
g1=g1+geom_point(aes(set1$sqft_living,Yhat1),color='pink',pch=3)
g1=g1+ggtitle("Subset 1")+ylab(paste('penalty',L))

g2=ggplot(set2)+geom_point(aes(sqft_living,set2$price),color='lightblue',pch=5)
g2=g2+geom_point(aes(X2[,1],Yhat2),color='pink',pch=3)
g2=g2+ggtitle("Subset 2")

g3=ggplot(set3)+geom_point(aes(sqft_living,set3$price),color='lightblue',pch=5)
g3=g3+geom_point(aes(X3[,1],Yhat3),color='pink',pch=3)
g3=g3+ggtitle("Subset 3")

g4=ggplot(set4)+geom_point(aes(sqft_living,set4$price),color='lightblue',pch=5)
g4=g4+geom_point(aes(X4[,1],Yhat4),color='pink',pch=3)
g4=g4+ggtitle("Subet 4")

t=round( data.frame(coef(fit1)[2],coef(fit2)[2], coef(fit3)[2],coef(fit4)[2]),2)
rownames(t)='Coefficients of SquareFeet'
colnames(t)=c('fit1','fit2','fit3','fit4')
grid.arrange(g1,g2,g3,g4,ncol=4)
table=data.frame(Lamda=format(L,scientific=T),min=min(t),max=max(t))
print(kable(t))
print(kable(table))
}

## 
## 
##                                 fit1       fit2      fit3      fit4
## ---------------------------  -------  ---------  --------  --------
## Coefficients of SquareFeet    797.27   -4235.44   2944.53   -766.61
## 
## 
## Lamda         min       max
## ------  ---------  --------
## 1e-09    -4235.44   2944.53

## 
## 
##                                 fit1    fit2    fit3    fit4
## ---------------------------  -------  ------  ------  ------
## Coefficients of SquareFeet    111.68   76.95   89.92   79.66
## 
## 
## Lamda         min      max
## ---------  ------  -------
## 1.23e+02    76.95   111.68

6. How to select a penalty?

# Step1 K-fold segments
# segment i starts at index (n*i/k) and ends at (n*(i+1)/k)-1.
n=dim(shuffle)[1]
k=10
kfold= matrix(0, 3,k) 
for (i in c(1:k)){
  start=floor(n*(i-1)/k+1)
  end=floor(n*(i)/k)
  kfold[,i]=rbind(i,start,end)
}
kfold
##      [,1] [,2] [,3] [,4] [,5]  [,6]  [,7]  [,8]  [,9] [,10]
## [1,]    1    2    3    4    5     6     7     8     9    10
## [2,]    1 1940 3880 5819 7759  9699 11638 13578 15517 17457
## [3,] 1939 3879 5818 7758 9698 11637 13577 15516 17456 19396
# Step2 Cross validation among K FOLDS
RSS=matrix(0,1,k)
exp=seq(3,9,0.5)
Lamda=10^exp
CVrss=matrix(0,1,length(Lamda))
for( j in c(1:length(Lamda)) ){
  L=Lamda[j]
  for (i in c(1:k)){
    valid=shuffle[c(kfold[2,i]:kfold[3,i]),]
    valid=arrange(valid,sqft_living)
    train=shuffle[-c(kfold[2,i]:kfold[3,i]),]
    train=arrange(train,sqft_living)
    X=as.matrix( polyfeature(15,train$sqft_living) )
    Y=train$price  ;  fit=lm.ridge(Y~X,lambda = L)
    Xvalid= as.matrix( polyfeature(15,valid$sqft_living) )
    Yhat=cbind(1,Xvalid) %*% coef(fit) 
    RSS[i]=sum((valid$price-Yhat)^2)
  }
  CVrss[,j]=mean(RSS)
}
CVrss=format(CVrss)
which(CVrss==min(CVrss))
## [1] 1
primeL=Lamda[which(CVrss==min(CVrss))]
primeL
## [1] 1000

7. Use the lamda to train our training set

train=arrange(train,sqft_living)
x=train$sqft_living
X=as.matrix( polyfeature(15,x) )
Y=train$price  
fit=lm.ridge(Y~X,lambda = primeL)
Xtest=as.matrix( polyfeature(15,test$sqft_living))
Yhat=cbind(1,Xtest) %*% coef(fit) 
RSS=sum((test$price-Yhat)^2)
RSS
## [1] 1.371019e+14