R Demo

data(anscombe)
#View(anscombe)
plot(y1 ~ x1, data = anscombe)
fit <- lm(y1 ~ x1, data = anscombe)
fit
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
plot(y1 ~ x1, data = anscombe)
abline(fit, col = "red")

R Basics

RRP <- 35.99
Exchange <- 31.74
NTD <- RRP * Exchange
NTD
## [1] 1142.323

Vector

height  <- 175
height2 <- 180

heights <- c(180,177,169)
heights[3]
## [1] 169
name <- c('Toby', 'John', 'Marry')

heights <- heights / 100

names(heights) <- name
heights
##  Toby  John Marry 
##  1.80  1.77  1.69
# Sample people with their heights (centimetres) and weights.
names_vec  <- c('Brian', 'Toby', 'Sherry')
height_vec <- c(180, 169, 173)
weight_vec <- c(73, 87, 43)

# BMI = weight / (height in metres)^2; dividing by 100 converts cm to m.
# setNames() attaches the person names in the same step.
bmi_vec <- setNames(weight_vec / (height_vec / 100)^2, names_vec)

# Show everyone whose BMI falls outside the [18.5, 24) band.
bmi_vec[!(bmi_vec >= 18.5 & bmi_vec < 24)]
##     Toby   Sherry 
## 30.46112 14.36734

Matrix

kevin <-c(85,73)
marry <-c(72,64)
jerry <-c(59,66)

c(kevin, marry, jerry)
## [1] 85 73 72 64 59 66
mat <- matrix( c(kevin, marry, jerry), nrow=3, byrow=TRUE)

mat[,1] * 0.4 + mat[,2] * 0.6
## [1] 77.8 67.2 63.2

Factor

#Data
 ## Qualitative
    ## Factor
 ## Quantitative
    ## Discrete
        ## Employee
    ## Continuous
        ## height

weather <- c("sunny","rainy", "cloudy", "rainy", "cloudy")
class(weather)
## [1] "character"
weather_factor <- factor(weather)
levels(weather_factor)
## [1] "cloudy" "rainy"  "sunny"

List

c('James', 180, TRUE)
## [1] "James" "180"   "TRUE"
person <- list(name='James', height=180, Employ=TRUE)
person
## $name
## [1] "James"
## 
## $height
## [1] 180
## 
## $Employ
## [1] TRUE
person$name
## [1] "James"
li <- list(c(98,82,66,54), c(83,72,77))
lapply(li, sum)
## [[1]]
## [1] 300
## 
## [[2]]
## [1] 232

Data Frame

# Load the built-in iris data set and check what kind of object it is.
data(iris)
# View() opens a spreadsheet-style viewer, which is only useful in an
# interactive session; skip it when the script is run non-interactively.
if (interactive()) {
  View(iris)
}
class(iris)
## [1] "data.frame"
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
head(iris)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
tail(iris)
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica
?head
## starting httpd help server ... done
head(iris, 10)
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa
iris[  1   ,     ]
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
iris[  1   ,    3]
## [1] 1.4
head(iris[      ,    1])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[      ,    'Sepal.Length'])
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[ c(1,2,3),     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
head(iris[ 1:100,     ])
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
head(iris[      ,    c(1, 2) ])
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9
head(iris[      ,    c('Sepal.Length', 'Sepal.Width')])
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9
head(iris$Sepal.Length)
## [1] 5.1 4.9 4.7 4.6 5.0 5.4
head(iris[ 1:5      ,  c('Sepal.Length', 'Sepal.Width')])
##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
head(iris[iris$Species  == 'setosa' & iris$Sepal.Length >= 5,    ])
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 11          5.4         3.7          1.5         0.2  setosa
## 15          5.8         4.0          1.2         0.2  setosa
heights <- c(172,180,188,167,175)
sort(heights)
## [1] 167 172 175 180 188
sort(heights, decreasing = TRUE)
## [1] 188 180 175 172 167
head(sort(iris$Sepal.Length, decreasing = TRUE))
## [1] 7.9 7.7 7.7 7.7 7.7 7.6
order(heights)
## [1] 4 1 5 2 3
heights[order(heights)]
## [1] 167 172 175 180 188
heights[c(3,4,5,1,2)]
## [1] 188 167 175 172 180
head(iris[order(iris$Sepal.Length, decreasing = TRUE),])
##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 132          7.9         3.8          6.4         2.0 virginica
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica
## 123          7.7         2.8          6.7         2.0 virginica
## 136          7.7         3.0          6.1         2.3 virginica
## 106          7.6         3.0          6.6         2.1 virginica
tb <- table(iris$Species)
pie(tb)

barplot(tb, col= "blue")

hist(iris$Sepal.Length)

boxplot(iris$Sepal.Length)

boxplot(Petal.Length ~ Species, data = iris)

plot(Petal.Width ~ Petal.Length, data = iris, col=iris$Species)

實價登錄探索

#download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/lvr_prices.csv', 'lvr_prices.csv')


#getwd()
library(readr)
# Read the real-estate transaction records.
# total_price is forced to double: readr guesses integer for this column, but
# several prices (e.g. 6700000000 in the parsing-failure report below) exceed
# the largest R integer (2147483647) and would be turned into NA.
# The transcript that follows was recorded with the original integer parse.
# NOTE(review): the absolute path is machine-specific; point it at wherever
# lvr_prices.csv was saved.
lvr_prices <- read_csv(
  "C:/Users/Administrator/Desktop/lvr_prices.csv",
  col_types = cols(total_price = col_double())
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
#View(lvr_prices)

class(lvr_prices)
## [1] "tbl_df"     "tbl"        "data.frame"
daan <- lvr_prices[ lvr_prices$area ==  '大安區' ,   ]



a <- c(1,2,3)
sum(a)
## [1] 6
a <- c(1,2,3, NA)
sum(a)
## [1] NA
?sum
sum(a, na.rm=TRUE)
## [1] 6
mean(daan$total_price, na.rm=TRUE)
## [1] 29798170
median(daan$total_price, na.rm=TRUE)
## [1] 20000000
summary(daan$total_price, na.rm=TRUE)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's 
## 0.000e+00 9.498e+06 2.000e+07 2.980e+07 3.480e+07 1.870e+09         4
zhongshan <- lvr_prices[ lvr_prices$area == '中山區' ,  c('address', 'total_price')   ]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx,]
res[1:3, ]
## # A tibble: 3 x 2
##                               address total_price
##                                 <chr>       <int>
## 1 臺北市中山區建國北路一段138巷1~30號  1850000000
## 2      臺北市中山區南京東路三段1~30號  1400000000
## 3               中山段二小段31~60地號  1084948034
tail(res)
## # A tibble: 6 x 2
##                             address total_price
##                               <chr>       <int>
## 1           中山段一小段691~720地號           0
## 2 臺北市中山區建國北路二段121~150號          NA
## 3 臺北市中山區民生東路二段121~150號          NA
## 4           吉林段四小段631~660地號          NA
## 5                  金泰段91~120地號          NA
## 6           吉林段二小段751~780地號          NA
# Return the most expensive transactions of one area.
#
# Arguments:
#   area  Area name to match against the `area` column.
#   data  Data frame with `area`, `address` and `total_price` columns;
#         defaults to the global `lvr_prices`, as before.
#   n     Maximum number of rows to return (default 3).
#
# Returns the `address` and `total_price` columns of the (up to) n rows with
# the highest total_price, in descending price order.
getTopThree <- function(area, data = lvr_prices, n = 3) {
  # which() drops rows whose area is NA; a bare logical index would turn each
  # of them into an all-NA row.
  in_area <- which(data$area == area)
  candidates <- data[in_area, c('address', 'total_price')]

  # order() puts NA prices last, so they never outrank a real price.
  ranked <- candidates[order(candidates$total_price, decreasing = TRUE), ]

  # head() returns fewer rows when fewer exist; [1:3, ] would pad with NA rows.
  head(ranked, n)
}

getTopThree('大安區')
## # A tibble: 3 x 2
##                                  address total_price
##                                    <chr>       <int>
## 1 臺北市大安區羅斯福路三段283巷4弄1~30號  1869781219
## 2      臺北市大安區忠孝東路四段241~270號   971340000
## 3                  學府段三小段31~60地號   966660000
heights <- c(180,165,177,172,150)
gender  <- c('M', 'F', 'M', 'M', 'F')

tapply(heights, gender, mean)
##        F        M 
## 157.5000 176.3333
# Mean total_price per area; na.rm = TRUE is forwarded to mean() so missing
# prices are ignored.
price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, mean, na.rm = TRUE)


barplot(sort(price_per_sec, decreasing = TRUE), cex.axis = 0.6, cex.names = 0.6, col='blue', xlab = '區域', main = '價格長條圖')

salary <- c(70,75,78,85,100,120,160,140,180,120)
mean(salary)
## [1] 112.8
salary <- c(70,75,78,85,100,120,160,140,180,120, 1200000)
mean(salary)
## [1] 109193.5
sort(salary)
##  [1]      70      75      78      85     100     120     120     140
##  [9]     160     180 1200000
median(salary)
## [1] 120
(78 + 85)  / 2
## [1] 81.5
quantile(salary, 0.25)
##  25% 
## 81.5
(140 + 160) / 2
## [1] 150
quantile(salary, 0.75)
## 75% 
## 150
150 - 81.5
## [1] 68.5
IQR(salary)
## [1] 68.5
# Adjacent values are the whisker ends of a box plot: the most extreme
# observations that still lie inside the fences Q1 - 1.5 * IQR and
# Q3 + 1.5 * IQR. The fences are anchored on the quartiles, not the median,
# and the result is an actual observation rather than the fence itself.
# (The variable names keep their original spelling so existing references
# continue to work; "adgencet" stands for "adjacent".)
lower_adgencet_value <- min(salary[salary >= quantile(salary, 0.25) - 1.5 * IQR(salary)])

upper_adgencet_value <- max(salary[salary <= quantile(salary, 0.75) + 1.5 * IQR(salary)])

boxplot(salary)

boxplot(salary[salary < 1000])

boxplot(total_price ~ area, data = lvr_prices)
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA
## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

## Warning in x[floor(d)] + x[ceiling(d)]: 整數向上溢位產生了 NA

# Box plot of log prices per area. Rows with a zero (or missing) price are
# removed first: log(0) is -Inf, which boxplot() cannot draw and reports as
# "Outlier (-Inf) ... is not drawn" warnings.
# The former cex.name argument was dropped because boxplot() has no such
# parameter; cex.axis already scales the axis annotation.
boxplot(log(total_price) ~ area, data = subset(lvr_prices, total_price > 0), cex.axis = 0.5)

實價登錄分析

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#lvr_prices[ lvr_prices$area == '中山區'  , c('address', 'total_price')   ]

#filter(lvr_prices, area == '中山區')
#select(lvr_prices, address, total_price)

sum(tail(head(iris[iris$Species == 'setosa',])$Sepal.Length, 3))
## [1] 15
iris[iris$Species=='setosa',] %>% head() %>% tail(3) %>% .$Sepal.Length %>% sum()
## [1] 15
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  head()
## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區     5960000
## 2 中山區    20200000
## 3 中山區     4050000
## 4 中山區     1900000
## 5 中山區    14800000
## 6 中山區    10200000
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  arrange(total_price) %>%
  head()
## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區           0
## 2 中山區           0
## 3 中山區       10860
## 4 中山區       16000
## 5 中山區       18060
## 6 中山區       21244
lvr_prices %>%
  select(area, total_price) %>%
  filter(area == '中山區') %>%
  arrange(desc(total_price)) %>%
  head()
## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區  1850000000
## 2 中山區  1400000000
## 3 中山區  1084948034
## 4 中山區  1011136500
## 5 中山區   952875000
## 6 中山區   903865500
# Collapse every trading date to the first day of its month so that
# transactions can be aggregated per month.
lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))

# Total transaction value per area and month from January 2012 onwards.
lvr_stat <- lvr_prices %>%
  select(area, trading_ym, total_price) %>%
  # Compare Date with Date instead of relying on implicit string coercion.
  filter(trading_ym >= as.Date('2012-01-01')) %>%
  group_by(area, trading_ym) %>%
  # as.numeric() avoids integer overflow when large prices are summed.
  summarise(overall_price = sum(as.numeric(total_price), na.rm = TRUE)) %>%
  # Drop the leftover grouping before the grouping column itself is modified
  # and before the table is filtered and reshaped further down.
  ungroup() %>%
  mutate(area = as.factor(area))

# Draw one monthly price trend per area on a 3 x 4 grid of panels.
# par() returns the previous settings; they are restored after the loop so
# later plots are not squeezed into the same grid.
old_par <- par(mfrow = c(3, 4))
for (area_name in levels(lvr_stat$area)) {
  # Dedicated names keep the loop from overwriting the earlier `a` and
  # `zhongshan` objects.
  area_stat <- lvr_stat %>% filter(area == area_name)
  plot(overall_price ~ trading_ym, data = area_stat, type = 'l', main = area_name)
}
par(old_par)

library(tidyr)
price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill=0)
#price_pivot

write.csv(price_pivot, 'taipei_house_price.csv')

分類

#install.packages('rpart')
library(rpart)

# Fetch the training data into the working directory and load it.
download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/Training50.csv', 'Training50.csv')

trainset <- read.csv('Training50.csv')
# Only open the data viewer when someone is there to look at it.
if (interactive()) {
  View(trainset)
}

fit <- rpart(Creditability ~ Account.Balance + Duration.of.Credit..month. + Payment.Status.of.Previous.Credit + Purpose+Credit.Amount+Value.Savings.Stocks+Length.of.current.employment+Instalment.per.cent+Sex...Marital.Status+Guarantors+Duration.in.Current.address+Most.valuable.available.asset+Age..years.+Concurrent.Credits+Type.of.apartment+No.of.Credits.at.this.Bank+Occupation+No.of.dependents+Telephone, data = trainset, method='class')


plot(fit, margin=0.1)
text(fit)

download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/Test50.csv', 'Test50.csv')

testset <- read.csv('Test50.csv')


predicted <- predict(fit, testset, type= 'class')
table(testset$Creditability, predicted)
##    predicted
##       0   1
##   0  64  93
##   1  52 291
(291 + 64) / (93 + 52 + 291 + 64)
## [1] 0.71
#install.packages('randomForest')
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
# A factor response makes randomForest() fit a classification forest.
trainset$Creditability <- as.factor(trainset$Creditability)
# Remove column X so that the `.` in the formula does not use it as a
# predictor (presumably a row index written with the CSV; confirm).
trainset$X <- NULL

# NOTE(review): no seed is set before this call, so the forest and the
# figures derived from it can differ from run to run.
forest <- randomForest(Creditability ~ ., data = trainset, ntree = 200,
                       importance = TRUE, proximity = TRUE)

# Predict the test set and cross-tabulate actual (rows) against predicted
# (columns) classes.
forest.predicted <- predict(forest, testset, type = "class")
table(testset$Creditability, forest.predicted)
##    forest.predicted
##       0   1
##   0  48 109
##   1  23 320
# Accuracy = share of test rows whose predicted class equals the actual class.
# It is computed from the predictions instead of hand-typed counts, so it
# always agrees with the confusion table: (48 + 320) / 500 for the table
# recorded above.
mean(as.character(forest.predicted) == as.character(testset$Creditability))
## [1] 0.736
#install.packages('ROCR')
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Decision tree (the rpart model stored in `fit`)
# Class probabilities for the test set, one column per class.
predictions1 <-predict(fit, testset, type="prob")
# Keep the second probability column (presumably class "1"; confirm).
pred.to.roc1 <-predictions1[, 2]
# Pair the scores with the true labels in a ROCR prediction object.
pred.rocr1 <-prediction(pred.to.roc1, as.factor(testset$Creditability))
# Area under the ROC curve; stored here but not printed in this section.
perf.rocr1 <-performance(pred.rocr1, measure ="auc", x.measure="cutoff")
# True-positive rate against false-positive rate, i.e. the ROC curve itself.
perf.tpr.rocr1 <-performance(pred.rocr1, "tpr","fpr")

# Random forest (the model stored in `forest`), same steps as above
predictions2 <-predict(forest, testset, type="prob")
pred.to.roc2 <-predictions2[, 2]
pred.rocr2 <-prediction(pred.to.roc2, as.factor(testset$Creditability))
perf.rocr2 <-performance(pred.rocr2, measure ="auc", x.measure="cutoff")
perf.tpr.rocr2 <-performance(pred.rocr2, "tpr","fpr")

# Draw the rpart curve first (colour 1); it creates the plot.
plot(perf.tpr.rocr1,main='ROC Curve', col=1)
# The fourth positional argument of legend() is `fill`, so 1:2 colours the
# legend boxes to match the two curves.
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
# Overlay the random forest curve (colour 2) on the existing plot.
plot(perf.tpr.rocr2, col=2, add=TRUE)

Regression

# Fetch the house price data into the working directory and load it.
download.file('https://raw.githubusercontent.com/ywchiu/feibr/master/data/house-prices.csv', 'house-prices.csv')

house_prices <- read.csv('house-prices.csv')
# Only open the data viewer when someone is there to look at it.
if (interactive()) {
  View(house_prices)
}

# Turn the categorical columns into 0/1 indicator columns: a logical
# comparison converted to numeric gives 1 for a match and 0 otherwise.
house_prices$brick_d <- as.numeric(house_prices$Brick == "Yes")
house_prices$east <- as.numeric(house_prices$Neighborhood == "East")
house_prices$north <- as.numeric(house_prices$Neighborhood == "North")

# With a fixed seed, draw 60 % of the row indices (rounded down) for
# training; all remaining rows form the validation set.
set.seed(110)
sub <- sample(nrow(house_prices), size = floor(0.6 * nrow(house_prices)))
training_data <- house_prices[sub, ]
validation_data <- house_prices[-sub, ]

lm.fit1 <-lm(Price ~SqFt+Bathrooms+Bedrooms+Offers+north+east+brick_d, data=training_data)

summary(lm.fit1)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28809.1  -5439.8   -251.1   5716.9  26720.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33263.403  13018.025   2.555   0.0129 *  
## SqFt            48.918      7.103   6.887 2.26e-09 ***
## Bathrooms     4886.975   2746.714   1.779   0.0797 .  
## Bedrooms      4352.011   1971.204   2.208   0.0306 *  
## Offers       -5655.299   1314.227  -4.303 5.52e-05 ***
## north       -23296.029   3891.067  -5.987 8.96e-08 ***
## east        -22978.967   3063.771  -7.500 1.77e-10 ***
## brick_d      18500.732   2379.993   7.773 5.65e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9244 on 68 degrees of freedom
## Multiple R-squared:  0.8864, Adjusted R-squared:  0.8747 
## F-statistic: 75.77 on 7 and 68 DF,  p-value: < 2.2e-16
lm.fit1.step <-step(lm.fit1)
## Start:  AIC=1395.57
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east + 
##     brick_d
## 
##             Df  Sum of Sq        RSS    AIC
## <none>                    5.8106e+09 1395.6
## - Bathrooms  1  270499345 6.0811e+09 1397.0
## - Bedrooms   1  416513849 6.2271e+09 1398.8
## - Offers     1 1582279549 7.3929e+09 1411.9
## - north      1 3062944430 8.8736e+09 1425.8
## - SqFt       1 4053393477 9.8640e+09 1433.8
## - east       1 4806858174 1.0617e+10 1439.4
## - brick_d    1 5163439960 1.0974e+10 1441.9
summary(lm.fit1.step)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -28809.1  -5439.8   -251.1   5716.9  26720.7 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  33263.403  13018.025   2.555   0.0129 *  
## SqFt            48.918      7.103   6.887 2.26e-09 ***
## Bathrooms     4886.975   2746.714   1.779   0.0797 .  
## Bedrooms      4352.011   1971.204   2.208   0.0306 *  
## Offers       -5655.299   1314.227  -4.303 5.52e-05 ***
## north       -23296.029   3891.067  -5.987 8.96e-08 ***
## east        -22978.967   3063.771  -7.500 1.77e-10 ***
## brick_d      18500.732   2379.993   7.773 5.65e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9244 on 68 degrees of freedom
## Multiple R-squared:  0.8864, Adjusted R-squared:  0.8747 
## F-statistic: 75.77 on 7 and 68 DF,  p-value: < 2.2e-16
# Fitted prices and errors on the training rows. residuals() returns
# actual minus fitted values.
training_data$predict.price <- predict(lm.fit1)
training_data$error <- residuals(lm.fit1)

# Predicted prices and errors on the held-out rows. The error uses the same
# sign convention as residuals() (actual minus predicted), so the two error
# histograms can be compared directly.
validation_data$predict.price <- predict(lm.fit1, newdata = validation_data)
validation_data$error <- validation_data$Price - validation_data$predict.price

hist(training_data$error)

hist(validation_data$error)

a<-cor(training_data$Price,training_data$predict.price)
b<-cor(validation_data$Price,validation_data$predict.price)
a*a
## [1] 0.8863606
b*b
## [1] 0.840097