This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

library(knitr)
library(boot)

########### LOAD the Dataset. Combine them into one data frame. 

ex3x <- as.matrix(read.table('C:/temp/ex3x.dat'))

colnames(ex3x)<-c("sqf","bedrooms")

ex3y <- as.matrix(read.table('C:/temp/ex3y.dat'))

colnames(ex3y)<-c("price")

print(ex3x)
##        sqf bedrooms
##  [1,] 2104        3
##  [2,] 1600        3
##  [3,] 2400        3
##  [4,] 1416        2
##  [5,] 3000        4
##  [6,] 1985        4
##  [7,] 1534        3
##  [8,] 1427        3
##  [9,] 1380        3
## [10,] 1494        3
## [11,] 1940        4
## [12,] 2000        3
## [13,] 1890        3
## [14,] 4478        5
## [15,] 1268        3
## [16,] 2300        4
## [17,] 1320        2
## [18,] 1236        3
## [19,] 2609        4
## [20,] 3031        4
## [21,] 1767        3
## [22,] 1888        2
## [23,] 1604        3
## [24,] 1962        4
## [25,] 3890        3
## [26,] 1100        3
## [27,] 1458        3
## [28,] 2526        3
## [29,] 2200        3
## [30,] 2637        3
## [31,] 1839        2
## [32,] 1000        1
## [33,] 2040        4
## [34,] 3137        3
## [35,] 1811        4
## [36,] 1437        3
## [37,] 1239        3
## [38,] 2132        4
## [39,] 4215        4
## [40,] 2162        4
## [41,] 1664        2
## [42,] 2238        3
## [43,] 2567        4
## [44,] 1200        3
## [45,]  852        2
## [46,] 1852        4
## [47,] 1203        3
print(ex3y)
##        price
##  [1,] 399900
##  [2,] 329900
##  [3,] 369000
##  [4,] 232000
##  [5,] 539900
##  [6,] 299900
##  [7,] 314900
##  [8,] 198999
##  [9,] 212000
## [10,] 242500
## [11,] 239999
## [12,] 347000
## [13,] 329999
## [14,] 699900
## [15,] 259900
## [16,] 449900
## [17,] 299900
## [18,] 199900
## [19,] 499998
## [20,] 599000
## [21,] 252900
## [22,] 255000
## [23,] 242900
## [24,] 259900
## [25,] 573900
## [26,] 249900
## [27,] 464500
## [28,] 469000
## [29,] 475000
## [30,] 299900
## [31,] 349900
## [32,] 169900
## [33,] 314900
## [34,] 579900
## [35,] 285900
## [36,] 249900
## [37,] 229900
## [38,] 345000
## [39,] 549000
## [40,] 287000
## [41,] 368500
## [42,] 329900
## [43,] 314000
## [44,] 299000
## [45,] 179900
## [46,] 299900
## [47,] 239500
### with this command, we  pull out all the x values in standard format.
ex3x <- scale(ex3x)
ex3y <- scale(ex3y)

house_data <- data.frame(ex3x,ex3y)
print(house_data)
##              sqf   bedrooms       price
## 1   0.1300098691 -0.2236752  0.47574687
## 2  -0.5041898382 -0.2236752 -0.08407444
## 3   0.5024763638 -0.2236752  0.22862575
## 4  -0.7357230647 -1.5377669 -0.86702453
## 5   1.2574760154  1.0904165  1.59538948
## 6  -0.0197317285  1.0904165 -0.32399786
## 7  -0.5872397999 -0.2236752 -0.20403615
## 8  -0.7218814044 -0.2236752 -1.13094828
## 9  -0.7810230438 -0.2236752 -1.02697347
## 10 -0.6375731100 -0.2236752 -0.78305133
## 11 -0.0763567023  1.0904165 -0.80305294
## 12 -0.0008567372 -0.2236752  0.05268191
## 13 -0.1392733400 -0.2236752 -0.08328269
## 14  3.1172918237  2.4045083  2.87498104
## 15 -0.9219563121 -0.2236752 -0.64389575
## 16  0.3766430886  1.0904165  0.87561923
## 17 -0.8565230089 -1.5377669 -0.32399786
## 18 -0.9622229602 -0.2236752 -1.12374258
## 19  0.7654679091  1.0904165  1.27627534
## 20  1.2964843307  1.0904165  2.06803861
## 21 -0.2940482685 -0.2236752 -0.69987788
## 22 -0.1417900055 -1.5377669 -0.68308324
## 23 -0.4991565072 -0.2236752 -0.77985235
## 24 -0.0486733818  1.0904165 -0.64389575
## 25  2.3773921652 -0.2236752  1.86730269
## 26 -1.1333562145 -0.2236752 -0.72387022
## 27 -0.6828730891 -0.2236752  0.99238196
## 28  0.6610262907 -0.2236752  1.02837047
## 29  0.2508098133 -0.2236752  1.07635515
## 30  0.8007012262 -0.2236752 -0.32399786
## 31 -0.2034483104 -1.5377669  0.07587450
## 32 -1.2591894898 -2.8518586 -1.36366600
## 33  0.0494765729  1.0904165 -0.20403615
## 34  1.4298676025 -0.2236752  1.91528737
## 35 -0.2386816274  1.0904165 -0.43596212
## 36 -0.7092980769 -0.2236752 -0.72387022
## 37 -0.9584479619 -0.2236752 -0.88381916
## 38  0.1652431861  1.0904165  0.03668701
## 39  2.7863503098  1.0904165  1.66816625
## 40  0.2029931687  1.0904165 -0.42716493
## 41 -0.4236565421 -1.5377669  0.22462702
## 42  0.2986264579 -0.2236752 -0.08407444
## 43  0.7126179335  1.0904165 -0.21123385
## 44 -1.0075229393 -0.2236752 -0.33119556
## 45 -1.4454227371 -1.5377669 -1.28369153
## 46 -0.1870899846  1.0904165 -0.32399786
## 47 -1.0037479410 -0.2236752 -0.80704367
pairs(~.,data=house_data, main="Scatterplot Matrix of House Prices Data by Muthukumar Srinivasan")

# Scatterplot Matrices from the glus Package 
library(gclus)
## Warning: package 'gclus' was built under R version 3.2.3
## Loading required package: cluster
dta <- house_data[c(1,2,3)] # get data 
dta.r <- abs(cor(dta)) # get correlations
dta.col <- dmat.color(dta.r) # get colors
# reorder variables so those with highest correlation
# are closest to the diagonal
dta.o <- order.single(dta.r) 
cpairs(dta, dta.o, panel.colors=dta.col, gap=.5, main="Variables Ordered and Colored by Correlation" )

####Identify top 10 data points of House of Data

head(house_data)
##           sqf   bedrooms       price
## 1  0.13000987 -0.2236752  0.47574687
## 2 -0.50418984 -0.2236752 -0.08407444
## 3  0.50247636 -0.2236752  0.22862575
## 4 -0.73572306 -1.5377669 -0.86702453
## 5  1.25747602  1.0904165  1.59538948
## 6 -0.01973173  1.0904165 -0.32399786
####Reference : http://www.r-bloggers.com/linear-regression-by-gradient-descent/

cost <- function(X, y, theta) {
  sum( (rowSums(X %*% theta) - cbind(y,y,y))^2 ) / (2*length(y))
}

regression_ld <- function(x,y,alpha,num_iters){
  # initialize coefficients
  theta <- matrix(c(0,0,0,0,0,0,0,0,0), nrow=3)

  # add a column of 1's for the intercept coefficient 
  X <- as.matrix(cbind(1, as.matrix(x)))

  cost_history <- double(num_iters)
  # gradient descent
  for (i in 1:num_iters) {
    error <- as.matrix((as.matrix(X)%*%as.matrix(theta) - cbind(y,y,y)))
    delta <- t(X) %*% error / length(y)
    theta <- theta - alpha * delta
    cost_history[i] <- cost(X, y, theta[,1])
  }

  theta <- theta[,1]
  print(theta)

  plot(cost_history, type='line', col='blue', lwd=2, main=paste('Cost function: alpha=',alpha) , ylab='cost',xlab='Iterations')
  
  plot(cost_history, col=rgb(0.2,0.4,0.6,0.4), main='Linear regression by gradient descent')
  
  return(min(cost_history))
}

#####1000 Iterations

regression_ld(ex3x,ex3y,0.01,1000)
##                         sqf      bedrooms 
## -1.109337e-16  8.785037e-01 -4.691666e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character

## [1] 0.3921101
#####10000 Iterations

regression_ld(ex3x,ex3y,0.01,10000)
##                         sqf      bedrooms 
## -1.109514e-16  8.847660e-01 -5.317882e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character

## [1] 0.3920594
##### Multiple graphs

regression_ld(ex3x,ex3y,1,1000)
##                         sqf      bedrooms 
## -8.858162e-17  8.847660e-01 -5.317882e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character

## [1] 0.3920594
regression_ld(ex3x,ex3y,0.001,1000)
##                         sqf      bedrooms 
## -7.825655e-17  4.897084e-01  1.614397e-01
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character

## [1] 0.5494031
#Identify the Linear regression

house_data.glm <- glm(price ~ sqf+bedrooms, data=house_data)
summary(house_data.glm)
## 
## Call:
## glm(formula = price ~ sqf + bedrooms, data = house_data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.04433  -0.34898  -0.08661   0.34947   1.58467  
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.309e-16  7.707e-02   0.000    1.000    
## sqf          8.848e-01  9.403e-02   9.409 4.22e-12 ***
## bedrooms    -5.318e-02  9.403e-02  -0.566    0.575    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.2791938)
## 
##     Null deviance: 46.000  on 46  degrees of freedom
## Residual deviance: 12.285  on 44  degrees of freedom
## AIC: 78.315
## 
## Number of Fisher Scoring iterations: 2
###Identify the Least Squares

A <- cbind(1,ex3x)
B <- ex3y
A_T <- t(A)
A_TA <- A_T %*% A
A_TB <- A_T %*% B
A_TA_inv <- solve(A_TA)
x_hat <- A_TA_inv %*% A_TB
x_hat
##                  price
##          -9.256349e-17
## sqf       8.847660e-01
## bedrooms -5.317882e-02
#### 5-FOLD Cross validation method through Programming R
library(boot)
# 5-fold cross validation
(cv.err.5 <- cv.glm(house_data, house_data.glm, K = 5)$delta)
## [1] 0.2830680 0.2806415
# Test Error prediction
mu_hat <- fitted(house_data.glm)
house_data.diag <- glm.diag(house_data.glm)
(cv.err <- mean((house_data.glm$y - mu_hat)^2/(1 - house_data.diag$h)^2))
## [1] 0.2972694
####COMPARISON WITH OTHER LINEAR MODEL
#####In order to interpret the delta for this linear model, we need to compare it against some other model. linear model compares against a various degrees of polynomial models (1-3):

do_auto_poly_regression <- function(data,power){
  glm.fit <- glm(price~poly(sqf+bedrooms,power), data=data)  
  return(cv.glm(data,glm.fit,K=5)$delta[1])
}

degree<-c(1:3)
cv_err <- sapply(degree,  function(x) do_auto_poly_regression(house_data, x))
degree <- c(0,degree)
cv_err <- c(cv.err.5[1],cv_err)
cv_err
## [1] 0.2830680 0.4881431 0.4716946 0.4715987
plot(degree,cv_err,type='b'
     ,main="Cross Validation of Linear and Polynomial Models of house_data"
     ,xlab="Polynomial Degree"
     ,ylab="Cross Validation Error")

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.