# R Demo
# Load the built-in anscombe data set and plot its first x/y pair.
data("anscombe")
# View(anscombe)
plot(y1 ~ x1, data = anscombe)

# Fit a simple linear regression of y1 on x1; printing the model shows the
# call and the estimated coefficients.
fit <- lm(y1 ~ x1, data = anscombe)
fit
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001

# Redraw the scatter plot and overlay the fitted line in red.
plot(y1 ~ x1, data = anscombe)
abline(reg = fit, col = "red")

# R Basic
# Scalar arithmetic: multiply a price (RRP) by an exchange rate and print it.
RRP <- 35.99
Exchange <- 31.74
NTD <- Exchange * RRP
NTD
## [1] 1142.323
# Vector
# Single values are stored the same way as longer vectors.
height <- 175
height2 <- 180

# c() combines values into one vector; [ ] extracts an element by position.
heights <- c(180, 177, 169)
heights[3]
## [1] 169

# Divide every element by 100 and attach a name to each one.
name <- c('Toby', 'John', 'Marry')
heights <- setNames(heights / 100, name)
heights
## Toby John Marry
## 1.80 1.77 1.69

# BMI for several people at once: weight / (height in metres)^2.
height_vec <- c(180, 169, 173)
weight_vec <- c(73, 87, 43)
names_vec <- c('Brian', 'Toby', 'Sherry')
bmi_vec <- setNames(weight_vec / (height_vec / 100)^2, names_vec)

# Keep only the entries that fall outside the [18.5, 24) band.
bmi_vec[!(bmi_vec >= 18.5 & bmi_vec < 24)]
## Toby Sherry
## 30.46112 14.36734
# Matrix
# Two scores per person.
kevin <- c(85, 73)
marry <- c(72, 64)
jerry <- c(59, 66)
c(kevin, marry, jerry)
## [1] 85 73 72 64 59 66

# Fill row by row, so each person occupies one row of a 3 x 2 matrix.
mat <- matrix(c(kevin, marry, jerry), ncol = 2, byrow = TRUE)

# Weighted total per row: 40% of column 1 plus 60% of column 2.
0.4 * mat[, 1] + 0.6 * mat[, 2]
## [1] 77.8 67.2 63.2
# Factor
#Data
## Qualitative
## Factor
## Quantitative
## Discrete
## Employee
## Continuous
## height
# A plain character vector ...
weather <- c("sunny", "rainy", "cloudy", "rainy", "cloudy")
class(weather)
## [1] "character"

# ... converted to a factor; its levels are the sorted distinct values.
weather_factor <- as.factor(weather)
levels(weather_factor)
## [1] "cloudy" "rainy" "sunny"
# List
# c() coerces mixed inputs to one common type (character here) ...
c('James', 180, TRUE)
## [1] "James" "180" "TRUE"

# ... whereas a list keeps each element's own type.
person <- list(name = 'James', height = 180, Employ = TRUE)
person
## $name
## [1] "James"
##
## $height
## [1] 180
##
## $Employ
## [1] TRUE

# Extract one element by name.
person[["name"]]
## [1] "James"

# Apply sum() to every element of a list; the result is again a list.
li <- list(c(98, 82, 66, 54), c(83, 72, 77))
lapply(X = li, FUN = sum)
## [[1]]
## [1] 300
##
## [[2]]
## [1] 232
# Data Frame
# Load the built-in iris data set and inspect its class and structure.
data(iris)
# View() opens an interactive data viewer, so it is only called in an
# interactive session; elsewhere in this file it is simply commented out.
if (interactive()) {
  View(iris)
}
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Per-column summary statistics (counts per level for the factor column).
summary(object = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##

# First and last six rows.
head(x = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
tail(x = iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 145 6.7 3.3 5.7 2.5 virginica
## 146 6.7 3.0 5.2 2.3 virginica
## 147 6.3 2.5 5.0 1.9 virginica
## 148 6.5 3.0 5.2 2.0 virginica
## 149 6.2 3.4 5.4 2.3 virginica
## 150 5.9 3.0 5.1 1.8 virginica

# Look up the help page for head(), then ask for ten rows instead of six.
help("head")
## starting httpd help server ... done
head(iris, n = 10)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
# Data frames are indexed as [rows, columns]; an empty slot means "all".

# Row 1, every column.
iris[1, ]
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
# Single cell: row 1, column 3.
iris[1, 3]
## [1] 1.4
# One column, selected by position or by name, comes back as a vector.
head(iris[, 1])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[, 'Sepal.Length'])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
# Several rows, given as a vector of row numbers or as a range.
head(iris[1:3, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
head(iris[1:100, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Several columns, by position or by name, stay a data frame.
head(iris[, 1:2])
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
head(iris[, c('Sepal.Length', 'Sepal.Width')])
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
## 6 5.4 3.9
# $ extracts a single column by name.
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
# Rows and columns can be restricted in the same call.
head(iris[1:5, c('Sepal.Length', 'Sepal.Width')])
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
## 4 4.6 3.1
## 5 5.0 3.6
# A logical condition in the row slot keeps only the rows where it is TRUE.
head(iris[iris$Species == 'setosa' & iris$Sepal.Length >= 5, ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 8 5.0 3.4 1.5 0.2 setosa
## 11 5.4 3.7 1.5 0.2 setosa
## 15 5.8 4.0 1.2 0.2 setosa
# sort() returns the values themselves, ascending by default.
heights <- c(172, 180, 188, 167, 175)
sort(heights)
## [1] 167 172 175 180 188
sort(heights, decreasing = TRUE)
## [1] 188 180 175 172 167
head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6

# order() returns the positions that would sort the vector ...
order(heights)
## [1] 4 1 5 2 3
# ... so indexing with those positions reproduces sort().
heights[order(heights)]
## [1] 167 172 175 180 188
# Any other index vector simply rearranges the elements as listed.
heights[c(3, 4, 5, 1, 2)]
## [1] 188 167 175 172 180

# Ordering row indices sorts a whole data frame by one column.
head(iris[order(iris$Sepal.Length, decreasing = TRUE), ])
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 132 7.9 3.8 6.4 2.0 virginica
## 118 7.7 3.8 6.7 2.2 virginica
## 119 7.7 2.6 6.9 2.3 virginica
## 123 7.7 2.8 6.7 2.0 virginica
## 136 7.7 3.0 6.1 2.3 virginica
## 106 7.6 3.0 6.6 2.1 virginica
# Frequency table of the species, drawn as a pie chart and as a bar chart.
tb <- table(iris$Species)
pie(tb)
barplot(tb, col = "blue")

# Distribution of one numeric column: histogram and box plot.
hist(iris$Sepal.Length)
boxplot(iris$Sepal.Length)

# One box per species, via the formula interface.
boxplot(Petal.Length ~ Species, data = iris)

# Scatter plot with the points coloured by species.
plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species)

# 實價登錄探索 (Exploring actual-price registration data)
# Uncomment to fetch the data once into the working directory:
#download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/lvr_prices.csv', 'lvr_prices.csv')
#getwd()
library(readr)
# Read the copy that download.file() above saves in the working directory,
# using a relative path instead of a machine-specific absolute Windows path.
#
# total_price is forced to double. Guessed as an integer column it cannot hold
# values such as 6700000000 (the 32-bit integer maximum is 2147483647), so
# those rows were reported as parsing failures and stored as NA, which drops
# the most expensive transactions. All other columns are still guessed.
lvr_prices <- read_csv(
  "lvr_prices.csv",
  col_types = cols(total_price = col_double())
)
# The log below was recorded from the earlier run that used the absolute path
# and let total_price be guessed as integer; it is kept for reference.
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 file 2 2243 total_price an integer 3882685600 row 3 2244 total_price an integer 3373314400 col 4 4629 total_price an integer 3050000000 expected 5 5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
#View(lvr_prices)
class(lvr_prices)
## [1] "tbl_df" "tbl" "data.frame"
# All rows whose area is '大安區'.
daan <- lvr_prices[lvr_prices$area == '大安區', ]

# sum() of complete data ...
a <- c(1, 2, 3)
sum(a)
## [1] 6
# ... turns into NA as soon as one element is missing ...
a <- c(1, 2, 3, NA)
sum(a)
## [1] NA
help("sum")
# ... unless the missing values are removed first.
sum(a, na.rm = TRUE)
## [1] 6

# The same switch is needed for the price column, which contains NAs.
mean(daan$total_price, na.rm = TRUE)
## [1] 29798170
median(daan$total_price, na.rm = TRUE)
## [1] 20000000
summary(daan$total_price, na.rm = TRUE)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000e+00 9.498e+06 2.000e+07 2.980e+07 3.480e+07 1.870e+09 4
# Address and price of every row whose area is '中山區'.
zhongshan <- lvr_prices[lvr_prices$area == '中山區', c('address', 'total_price')]

# Row positions from most to least expensive; missing prices sort last.
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx, ]

# Three most expensive rows.
res[seq_len(3), ]
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市中山區建國北路一段138巷1~30號 1850000000
## 2 臺北市中山區南京東路三段1~30號 1400000000
## 3 中山段二小段31~60地號 1084948034
# Bottom of the ranking: the cheapest row followed by the missing prices.
tail(res)
## # A tibble: 6 x 2
## address total_price
## <chr> <int>
## 1 中山段一小段691~720地號 0
## 2 臺北市中山區建國北路二段121~150號 NA
## 3 臺北市中山區民生東路二段121~150號 NA
## 4 吉林段四小段631~660地號 NA
## 5 金泰段91~120地號 NA
## 6 吉林段二小段751~780地號 NA
#' Most expensive transactions in one area
#'
#' Selects the rows of the global `lvr_prices` table whose `area` column equals
#' `area` and returns their `address` and `total_price`, highest price first.
#' Rows with a missing price sort last.
#'
#' @param area Value to match against `lvr_prices$area`.
#' @param n Maximum number of rows to return. Defaults to 3.
#' @return The `address` and `total_price` columns of the `n` most expensive
#'   matching rows, or fewer rows when fewer than `n` match.
getTopThree <- function(area, n = 3) {
  # which() discards NA comparisons, so rows with a missing area are not
  # returned as all-NA rows.
  matching_rows <- which(lvr_prices$area == area)
  candidates <- lvr_prices[matching_rows, c('address', 'total_price')]
  ranked <- candidates[order(candidates$total_price, decreasing = TRUE), ]
  # head() stops at the last available row instead of padding with NA rows,
  # which ranked[1:n, ] would do when fewer than n rows match.
  head(ranked, n)
}
# Reuse the helper for another area.
getTopThree('大安區')
## # A tibble: 3 x 2
## address total_price
## <chr> <int>
## 1 臺北市大安區羅斯福路三段283巷4弄1~30號 1869781219
## 2 臺北市大安區忠孝東路四段241~270號 971340000
## 3 學府段三小段31~60地號 966660000

# tapply() splits the first vector by the second and applies the function to
# each group.
heights <- c(180, 165, 177, 172, 150)
gender <- c('M', 'F', 'M', 'M', 'F')
tapply(heights, gender, mean)
## F M
## 157.5000 176.3333

# Mean price per area, ignoring missing prices, drawn from highest to lowest.
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, mean, na.rm = TRUE)
barplot(
  sort(price_per_sec, decreasing = TRUE),
  cex.axis = 0.6,
  cex.names = 0.6,
  col = 'blue',
  xlab = '區域',
  main = '價格長條圖'
)

# The mean is sensitive to a single extreme value ...
salary <- c(70,75,78,85,100,120,160,140,180,120)
mean(salary)
## [1] 112.8
salary <- c(70,75,78,85,100,120,160,140,180,120, 1200000)
mean(salary)
## [1] 109193.5
sort(salary)
## [1] 70 75 78 85 100 120 120 140
## [9] 160 180 1200000
# ... whereas the median and the quartiles barely move.
median(salary)
## [1] 120
(78 + 85) / 2
## [1] 81.5
quantile(salary, 0.25)
## 25%
## 81.5
(140 + 160) / 2
## [1] 150
quantile(salary, 0.75)
## 75%
## 150
150 - 81.5
## [1] 68.5
IQR(salary)
## [1] 68.5
# Whisker ends ("adjacent values") of the box plot: the most extreme
# observations that still lie within 1.5 * IQR of the box, measured from the
# first and third quartiles rather than from the median. Taking min()/max()
# over the observations inside the fences makes the result an actual data
# point. The variable names are kept as originally spelled.
salary_q1 <- quantile(salary, 0.25, names = FALSE)
salary_q3 <- quantile(salary, 0.75, names = FALSE)
salary_fence_width <- 1.5 * IQR(salary)
lower_adgencet_value <- min(salary[salary >= salary_q1 - salary_fence_width])
upper_adgencet_value <- max(salary[salary <= salary_q3 + salary_fence_width])
boxplot(salary)

# Dropping the extreme value makes the box itself readable.
boxplot(salary[salary < 1000])

# Price distribution per area.
# total_price is converted to double before the box statistics are computed.
# With an integer column, adding two large prices overflowed and produced NA;
# the original run logged nine "NAs produced by integer overflow" warnings.
boxplot(as.numeric(total_price) ~ area, data = lvr_prices, ylab = "total_price")

# The same comparison on a log scale.
# Rows with a price of 0 are excluded: log(0) is -Inf, which the original run
# reported as "Outlier (-Inf) ... is not drawn" for several boxes.
# The stray `cex.name` argument is dropped; it looks like a carry-over from
# barplot()'s `cex.names`, and the labels here are sized through `cex.axis`.
boxplot(
  log(total_price) ~ area,
  data = lvr_prices,
  subset = total_price > 0,
  cex.axis = 0.5
)

# 實價登錄分析 (Analysing actual-price registration data)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#lvr_prices[ lvr_prices$area == '中山區' , c('address', 'total_price') ]
#filter(lvr_prices, area == '中山區')
#select(lvr_prices, address, total_price)
# Nested calls have to be read from the inside out ...
sum(tail(head(iris[iris$Species == 'setosa', ])$Sepal.Length, 3))
## [1] 15
# ... while the pipe lists the same steps in the order they run.
iris[iris$Species == 'setosa', ] %>%
  head() %>%
  tail(3) %>%
  .$Sepal.Length %>%
  sum()
## [1] 15

# Two columns, one area, first rows.
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 5960000
## 2 中山區 20200000
## 3 中山區 4050000
## 4 中山區 1900000
## 5 中山區 14800000
## 6 中山區 10200000

# Same selection, cheapest first.
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  arrange(total_price) %>%
  head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 0
## 2 中山區 0
## 3 中山區 10860
## 4 中山區 16000
## 5 中山區 18060
## 6 中山區 21244

# Same selection, most expensive first.
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  arrange(desc(total_price)) %>%
  head()
## # A tibble: 6 x 2
## area total_price
## <chr> <int>
## 1 中山區 1850000000
## 2 中山區 1400000000
## 3 中山區 1084948034
## 4 中山區 1011136500
## 5 中山區 952875000
## 6 中山區 903865500
# Truncate every trading date to the first day of its month.
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))

# Total transaction value per area and month, from January 2012 onwards.
# - The cutoff is an explicit Date rather than a string coerced during the
#   comparison.
# - as.numeric() keeps the sum from overflowing the integer range.
# - ungroup() returns a plain table, so the factor conversion below does not
#   modify a grouping column of a grouped table.
lvr_stat <- lvr_prices %>%
  select(area, trading_ym, total_price) %>%
  filter(trading_ym >= as.Date('2012-01-01')) %>%
  group_by(area, trading_ym) %>%
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE)) %>%
  ungroup()
lvr_stat$area <- as.factor(lvr_stat$area)

# One line chart per area in a 3 x 4 grid. The previous graphics settings are
# restored afterwards so later plots are not squeezed into the grid cells.
old_par <- par(mfrow = c(3, 4))
for (area_name in levels(lvr_stat$area)) {
  area_stat <- lvr_stat %>% filter(area == area_name)
  plot(overall_price ~ trading_ym, data = area_stat, type = 'l', main = area_name)
}
par(old_par)

library(tidyr)
# Reshape to one row per area and one column per month; area/month
# combinations without data are filled with 0.
price_pivot <- spread(
  lvr_stat,
  key = trading_ym,
  value = overall_price,
  fill = 0
)
# price_pivot
# NOTE(review): write.csv() also writes row names by default, which adds an
# unnamed leading column to the file - confirm that this is wanted.
write.csv(price_pivot, file = 'taipei_house_price.csv')
# 分類 (Classification)
#install.packages('rpart')
library(rpart)
# Download and read the training half of the credit data.
download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/Training50.csv', 'Training50.csv')
trainset <- read.csv('Training50.csv')
# View() needs an interactive session, so it is skipped otherwise.
if (interactive()) {
  View(trainset)
}
# Classification tree predicting Creditability from the listed predictors.
fit <- rpart(
  Creditability ~ Account.Balance + Duration.of.Credit..month. +
    Payment.Status.of.Previous.Credit + Purpose + Credit.Amount +
    Value.Savings.Stocks + Length.of.current.employment +
    Instalment.per.cent + Sex...Marital.Status + Guarantors +
    Duration.in.Current.address + Most.valuable.available.asset +
    Age..years. + Concurrent.Credits + Type.of.apartment +
    No.of.Credits.at.this.Bank + Occupation + No.of.dependents + Telephone,
  data = trainset,
  method = 'class'
)
# Draw the tree, then label its splits and leaves.
plot(fit, margin = 0.1)
text(fit)

# Download and read the held-out half, then predict a class for each row.
download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/Test50.csv', 'Test50.csv')
testset <- read.csv('Test50.csv')
predicted <- predict(fit, testset, type = 'class')

# Confusion matrix: actual class in rows, predicted class in columns.
confusion <- table(testset$Creditability, predicted)
confusion
## predicted
## 0 1
## 0 64 93
## 1 52 291
# Accuracy = correctly classified rows (the diagonal) / all rows. It is
# computed from the table itself rather than from hand-copied cell values, so
# it stays correct when the data or the model change.
sum(diag(confusion)) / sum(confusion)
## [1] 0.71
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
# randomForest() performs classification only when the response is a factor.
trainset$Creditability <- as.factor(trainset$Creditability)
# Drop column X so it is not used as a predictor by `~ .`
# (presumably the row index written into the CSV - verify against the file).
trainset$X <- NULL

# Fix the random seed: the forest is built from random samples, so without it
# every run gives a different model and a different confusion matrix.
set.seed(123)
forest <- randomForest(
  Creditability ~ .,
  data = trainset,
  ntree = 200,
  importance = TRUE,
  proximity = TRUE
)
forest.predicted <- predict(forest, testset, type = "class")

# Confusion matrix: actual class in rows, predicted class in columns.
forest.confusion <- table(testset$Creditability, forest.predicted)
forest.confusion
# Table recorded from an earlier, unseeded run:
## forest.predicted
## 0 1
## 0 48 109
## 1 23 320
# Accuracy is computed from the table itself. The earlier version retyped the
# counts by hand as (315 + 46) / (315 + 46 + 111 + 28) = 0.722, which does not
# match the table recorded above; that table gives (48 + 320) / 500 = 0.736.
sum(diag(forest.confusion)) / sum(forest.confusion)
#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Decision tree: probability of the second class, wrapped for ROCR together
# with the true labels.
predictions1 <- predict(fit, testset, type = "prob")
pred.to.roc1 <- predictions1[, 2]
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability))
# AUC object and the TPR-vs-FPR curve.
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr", "fpr")

# Random forest: the same steps.
predictions2 <- predict(forest, testset, type = "prob")
pred.to.roc2 <- predictions2[, 2]
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff")
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr", "fpr")

# Both ROC curves on one plot: the tree in colour 1, the forest in colour 2.
plot(perf.tpr.rocr1, main = 'ROC Curve', col = 1)
legend(0.7, 0.2, legend = c('rpart', 'randomforest'), fill = 1:2)
plot(perf.tpr.rocr2, col = 2, add = TRUE)

# Regression
# Download and read the house-price data.
download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/house-prices.csv', 'house-prices.csv')
house_prices <- read.csv('house-prices.csv')
# View() needs an interactive session, so it is skipped otherwise.
if (interactive()) {
  View(house_prices)
}

# 0/1 indicator columns. Any Neighborhood value other than "East" or "North"
# is the reference category, with both indicators equal to 0.
house_prices$brick_d <- ifelse(house_prices$Brick == "Yes", 1, 0)
house_prices$east <- ifelse(house_prices$Neighborhood == "East", 1, 0)
house_prices$north <- ifelse(house_prices$Neighborhood == "North", 1, 0)

# Reproducible 60/40 split into training and validation rows.
set.seed(110)
sub <- sample(nrow(house_prices), floor(nrow(house_prices) * 0.6))
training_data <- house_prices[sub, ]
validation_data <- house_prices[-sub, ]

# Linear model of Price on all predictors, fitted on the training rows only.
lm.fit1 <- lm(Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east + brick_d, data = training_data)
summary(lm.fit1)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north +
## east + brick_d, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28809.1 -5439.8 -251.1 5716.9 26720.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33263.403 13018.025 2.555 0.0129 *
## SqFt 48.918 7.103 6.887 2.26e-09 ***
## Bathrooms 4886.975 2746.714 1.779 0.0797 .
## Bedrooms 4352.011 1971.204 2.208 0.0306 *
## Offers -5655.299 1314.227 -4.303 5.52e-05 ***
## north -23296.029 3891.067 -5.987 8.96e-08 ***
## east -22978.967 3063.771 -7.500 1.77e-10 ***
## brick_d 18500.732 2379.993 7.773 5.65e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9244 on 68 degrees of freedom
## Multiple R-squared: 0.8864, Adjusted R-squared: 0.8747
## F-statistic: 75.77 on 7 and 68 DF, p-value: < 2.2e-16
# AIC-based stepwise selection starting from the full model. In the recorded
# trace below, dropping any single term raises the AIC, so no term is removed.
lm.fit1.step <- step(object = lm.fit1)
## Start: AIC=1395.57
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east +
## brick_d
##
## Df Sum of Sq RSS AIC
## <none> 5.8106e+09 1395.6
## - Bathrooms 1 270499345 6.0811e+09 1397.0
## - Bedrooms 1 416513849 6.2271e+09 1398.8
## - Offers 1 1582279549 7.3929e+09 1411.9
## - north 1 3062944430 8.8736e+09 1425.8
## - SqFt 1 4053393477 9.8640e+09 1433.8
## - east 1 4806858174 1.0617e+10 1439.4
## - brick_d 1 5163439960 1.0974e+10 1441.9

# Summary of the model that step() returned.
summary(object = lm.fit1.step)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north +
## east + brick_d, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -28809.1 -5439.8 -251.1 5716.9 26720.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33263.403 13018.025 2.555 0.0129 *
## SqFt 48.918 7.103 6.887 2.26e-09 ***
## Bathrooms 4886.975 2746.714 1.779 0.0797 .
## Bedrooms 4352.011 1971.204 2.208 0.0306 *
## Offers -5655.299 1314.227 -4.303 5.52e-05 ***
## north -23296.029 3891.067 -5.987 8.96e-08 ***
## east -22978.967 3063.771 -7.500 1.77e-10 ***
## brick_d 18500.732 2379.993 7.773 5.65e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9244 on 68 degrees of freedom
## Multiple R-squared: 0.8864, Adjusted R-squared: 0.8747
## F-statistic: 75.77 on 7 and 68 DF, p-value: < 2.2e-16
# In-sample predictions and errors. residuals() returns observed - fitted.
training_data$predict.price <- predict(lm.fit1)
training_data$error <- residuals(lm.fit1)

# Out-of-sample predictions. The error uses the same sign convention as
# residuals() (observed - predicted), so the two histograms below are directly
# comparable; it was previously predicted - observed.
validation_data$predict.price <- predict(lm.fit1, newdata = validation_data)
validation_data$error <- validation_data$Price - validation_data$predict.price
hist(training_data$error)

hist(validation_data$error)

# Squared correlation between observed and predicted price on each data set.
a <- cor(training_data$Price, training_data$predict.price)
b <- cor(validation_data$Price, validation_data$predict.price)
a * a
## [1] 0.8863606
b * b
## [1] 0.840097