This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
library(knitr)
library(boot)
########### LOAD the Dataset. Combine them into one data frame.
ex3x <- as.matrix(read.table('C:/temp/ex3x.dat'))
colnames(ex3x)<-c("sqf","bedrooms")
ex3y <- as.matrix(read.table('C:/temp/ex3y.dat'))
colnames(ex3y)<-c("price")
print(ex3x)
## sqf bedrooms
## [1,] 2104 3
## [2,] 1600 3
## [3,] 2400 3
## [4,] 1416 2
## [5,] 3000 4
## [6,] 1985 4
## [7,] 1534 3
## [8,] 1427 3
## [9,] 1380 3
## [10,] 1494 3
## [11,] 1940 4
## [12,] 2000 3
## [13,] 1890 3
## [14,] 4478 5
## [15,] 1268 3
## [16,] 2300 4
## [17,] 1320 2
## [18,] 1236 3
## [19,] 2609 4
## [20,] 3031 4
## [21,] 1767 3
## [22,] 1888 2
## [23,] 1604 3
## [24,] 1962 4
## [25,] 3890 3
## [26,] 1100 3
## [27,] 1458 3
## [28,] 2526 3
## [29,] 2200 3
## [30,] 2637 3
## [31,] 1839 2
## [32,] 1000 1
## [33,] 2040 4
## [34,] 3137 3
## [35,] 1811 4
## [36,] 1437 3
## [37,] 1239 3
## [38,] 2132 4
## [39,] 4215 4
## [40,] 2162 4
## [41,] 1664 2
## [42,] 2238 3
## [43,] 2567 4
## [44,] 1200 3
## [45,] 852 2
## [46,] 1852 4
## [47,] 1203 3
print(ex3y)
## price
## [1,] 399900
## [2,] 329900
## [3,] 369000
## [4,] 232000
## [5,] 539900
## [6,] 299900
## [7,] 314900
## [8,] 198999
## [9,] 212000
## [10,] 242500
## [11,] 239999
## [12,] 347000
## [13,] 329999
## [14,] 699900
## [15,] 259900
## [16,] 449900
## [17,] 299900
## [18,] 199900
## [19,] 499998
## [20,] 599000
## [21,] 252900
## [22,] 255000
## [23,] 242900
## [24,] 259900
## [25,] 573900
## [26,] 249900
## [27,] 464500
## [28,] 469000
## [29,] 475000
## [30,] 299900
## [31,] 349900
## [32,] 169900
## [33,] 314900
## [34,] 579900
## [35,] 285900
## [36,] 249900
## [37,] 229900
## [38,] 345000
## [39,] 549000
## [40,] 287000
## [41,] 368500
## [42,] 329900
## [43,] 314000
## [44,] 299000
## [45,] 179900
## [46,] 299900
## [47,] 239500
### with this command, we pull out all the x values in standard format.
ex3x <- scale(ex3x)
ex3y <- scale(ex3y)
house_data <- data.frame(ex3x,ex3y)
print(house_data)
## sqf bedrooms price
## 1 0.1300098691 -0.2236752 0.47574687
## 2 -0.5041898382 -0.2236752 -0.08407444
## 3 0.5024763638 -0.2236752 0.22862575
## 4 -0.7357230647 -1.5377669 -0.86702453
## 5 1.2574760154 1.0904165 1.59538948
## 6 -0.0197317285 1.0904165 -0.32399786
## 7 -0.5872397999 -0.2236752 -0.20403615
## 8 -0.7218814044 -0.2236752 -1.13094828
## 9 -0.7810230438 -0.2236752 -1.02697347
## 10 -0.6375731100 -0.2236752 -0.78305133
## 11 -0.0763567023 1.0904165 -0.80305294
## 12 -0.0008567372 -0.2236752 0.05268191
## 13 -0.1392733400 -0.2236752 -0.08328269
## 14 3.1172918237 2.4045083 2.87498104
## 15 -0.9219563121 -0.2236752 -0.64389575
## 16 0.3766430886 1.0904165 0.87561923
## 17 -0.8565230089 -1.5377669 -0.32399786
## 18 -0.9622229602 -0.2236752 -1.12374258
## 19 0.7654679091 1.0904165 1.27627534
## 20 1.2964843307 1.0904165 2.06803861
## 21 -0.2940482685 -0.2236752 -0.69987788
## 22 -0.1417900055 -1.5377669 -0.68308324
## 23 -0.4991565072 -0.2236752 -0.77985235
## 24 -0.0486733818 1.0904165 -0.64389575
## 25 2.3773921652 -0.2236752 1.86730269
## 26 -1.1333562145 -0.2236752 -0.72387022
## 27 -0.6828730891 -0.2236752 0.99238196
## 28 0.6610262907 -0.2236752 1.02837047
## 29 0.2508098133 -0.2236752 1.07635515
## 30 0.8007012262 -0.2236752 -0.32399786
## 31 -0.2034483104 -1.5377669 0.07587450
## 32 -1.2591894898 -2.8518586 -1.36366600
## 33 0.0494765729 1.0904165 -0.20403615
## 34 1.4298676025 -0.2236752 1.91528737
## 35 -0.2386816274 1.0904165 -0.43596212
## 36 -0.7092980769 -0.2236752 -0.72387022
## 37 -0.9584479619 -0.2236752 -0.88381916
## 38 0.1652431861 1.0904165 0.03668701
## 39 2.7863503098 1.0904165 1.66816625
## 40 0.2029931687 1.0904165 -0.42716493
## 41 -0.4236565421 -1.5377669 0.22462702
## 42 0.2986264579 -0.2236752 -0.08407444
## 43 0.7126179335 1.0904165 -0.21123385
## 44 -1.0075229393 -0.2236752 -0.33119556
## 45 -1.4454227371 -1.5377669 -1.28369153
## 46 -0.1870899846 1.0904165 -0.32399786
## 47 -1.0037479410 -0.2236752 -0.80704367
pairs(~.,data=house_data, main="Scatterplot Matrix of House Prices Data by Muthukumar Srinivasan")
# Scatterplot Matrices from the glus Package
library(gclus)
## Warning: package 'gclus' was built under R version 3.2.3
## Loading required package: cluster
dta <- house_data[c(1,2,3)] # get data
dta.r <- abs(cor(dta)) # get correlations
dta.col <- dmat.color(dta.r) # get colors
# reorder variables so those with highest correlation
# are closest to the diagonal
dta.o <- order.single(dta.r)
cpairs(dta, dta.o, panel.colors=dta.col, gap=.5, main="Variables Ordered and Colored by Correlation" )
####Identify top 10 data points of House of Data
head(house_data)
## sqf bedrooms price
## 1 0.13000987 -0.2236752 0.47574687
## 2 -0.50418984 -0.2236752 -0.08407444
## 3 0.50247636 -0.2236752 0.22862575
## 4 -0.73572306 -1.5377669 -0.86702453
## 5 1.25747602 1.0904165 1.59538948
## 6 -0.01973173 1.0904165 -0.32399786
####Reference : http://www.r-bloggers.com/linear-regression-by-gradient-descent/
cost <- function(X, y, theta) {
sum( (rowSums(X %*% theta) - cbind(y,y,y))^2 ) / (2*length(y))
}
regression_ld <- function(x,y,alpha,num_iters){
# initialize coefficients
theta <- matrix(c(0,0,0,0,0,0,0,0,0), nrow=3)
# add a column of 1's for the intercept coefficient
X <- as.matrix(cbind(1, as.matrix(x)))
cost_history <- double(num_iters)
# gradient descent
for (i in 1:num_iters) {
error <- as.matrix((as.matrix(X)%*%as.matrix(theta) - cbind(y,y,y)))
delta <- t(X) %*% error / length(y)
theta <- theta - alpha * delta
cost_history[i] <- cost(X, y, theta[,1])
}
theta <- theta[,1]
print(theta)
plot(cost_history, type='line', col='blue', lwd=2, main=paste('Cost function: alpha=',alpha) , ylab='cost',xlab='Iterations')
plot(cost_history, col=rgb(0.2,0.4,0.6,0.4), main='Linear regression by gradient descent')
return(min(cost_history))
}
#####1000 Iterations
regression_ld(ex3x,ex3y,0.01,1000)
## sqf bedrooms
## -1.109337e-16 8.785037e-01 -4.691666e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character
## [1] 0.3921101
#####10000 Iterations
regression_ld(ex3x,ex3y,0.01,10000)
## sqf bedrooms
## -1.109514e-16 8.847660e-01 -5.317882e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character
## [1] 0.3920594
##### Multiple graphs
regression_ld(ex3x,ex3y,1,1000)
## sqf bedrooms
## -8.858162e-17 8.847660e-01 -5.317882e-02
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character
## [1] 0.3920594
regression_ld(ex3x,ex3y,0.001,1000)
## sqf bedrooms
## -7.825655e-17 4.897084e-01 1.614397e-01
## Warning in plot.xy(xy, type, ...): plot type 'line' will be truncated to
## first character
## [1] 0.5494031
#Identify the Linear regression
house_data.glm <- glm(price ~ sqf+bedrooms, data=house_data)
summary(house_data.glm)
##
## Call:
## glm(formula = price ~ sqf + bedrooms, data = house_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.04433 -0.34898 -0.08661 0.34947 1.58467
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.309e-16 7.707e-02 0.000 1.000
## sqf 8.848e-01 9.403e-02 9.409 4.22e-12 ***
## bedrooms -5.318e-02 9.403e-02 -0.566 0.575
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.2791938)
##
## Null deviance: 46.000 on 46 degrees of freedom
## Residual deviance: 12.285 on 44 degrees of freedom
## AIC: 78.315
##
## Number of Fisher Scoring iterations: 2
###Identify the Least Squares
A <- cbind(1,ex3x)
B <- ex3y
A_T <- t(A)
A_TA <- A_T %*% A
A_TB <- A_T %*% B
A_TA_inv <- solve(A_TA)
x_hat <- A_TA_inv %*% A_TB
x_hat
## price
## -9.256349e-17
## sqf 8.847660e-01
## bedrooms -5.317882e-02
#### 5-FOLD Cross validation method through Programming R
library(boot)
# 5-fold cross validation
(cv.err.5 <- cv.glm(house_data, house_data.glm, K = 5)$delta)
## [1] 0.2830680 0.2806415
# Test Error prediction
mu_hat <- fitted(house_data.glm)
house_data.diag <- glm.diag(house_data.glm)
(cv.err <- mean((house_data.glm$y - mu_hat)^2/(1 - house_data.diag$h)^2))
## [1] 0.2972694
####COMPARISON WITH OTHER LINEAR MODEL
#####In order to interpret the delta for this linear model, we need to compare it against some other model. linear model compares against a various degrees of polynomial models (1-3):
do_auto_poly_regression <- function(data,power){
glm.fit <- glm(price~poly(sqf+bedrooms,power), data=data)
return(cv.glm(data,glm.fit,K=5)$delta[1])
}
degree<-c(1:3)
cv_err <- sapply(degree, function(x) do_auto_poly_regression(house_data, x))
degree <- c(0,degree)
cv_err <- c(cv.err.5[1],cv_err)
cv_err
## [1] 0.2830680 0.4881431 0.4716946 0.4715987
plot(degree,cv_err,type='b'
,main="Cross Validation of Linear and Polynomial Models of house_data"
,xlab="Polynomial Degree"
,ylab="Cross Validation Error")
You can also embed plots, for example:
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.