setwd("~/Desktop/data hw/hw2")
1 a.生成一筆資料:
a <- sample(0:10,20,replace=T)
e <- rnorm(20,0,2)
x <- a+e
x1 <- sample(x[which(0<=x&x<=11)],20,replace=T)
x1
## [1] 2.395214 7.380624 9.611598 10.447257 5.822333 10.447257 5.686412
## [8] 5.039281 5.039281 5.039281 5.686412 4.110476 4.520323 5.039281
## [15] 5.686412 4.520323 5.822333 7.042843 7.042843 10.447257
cauchy<- function(theta,x1){
b<- rep(0,20)
for(i in x1) {
up<- theta-i
ud<- 1+(theta-i)^2
c<- up/ud
b[i]<- c
}
sum<- sum(b)*(-2)
return(sum)
}
c.代入a生成的資料至b的function,並令θ=0.3
cauchy(0.3,x1)
## [1] 2.274594
2 a. 根據Build_year,建立一個新類別變數year_type,1899年以前的房子為“centennial”,1900~1959年為“old”,1960年以上為“new”
houseprice<- read.csv(file="houseprice.csv")
library(tidyverse)
## ─ Attaching packages ──────────────────── tidyverse 1.3.1 ─
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.0.2 ✓ forcats 0.5.1
## ─ Conflicts ───────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
houseprice_Harvard<- houseprice %>% filter(University=="Harvard University") %>%
mutate(Year_type= ifelse(Build_year<=1899,"centennial",
ifelse(Build_year>=1900&Build_year<=1959,"old","new"))) %>%
select(Record,Sale_amount,Sale_date,Beds,Baths,Sqft_home,Sqft_lot,Type,Build_year,Year_type,everything())
houseprice_Harvard #哈佛大學地區的data值
b.決定好你的最佳配適模型後,總結你的發現並根據解釋變數預測房屋價格 因房屋皆在哈佛大學地區,Town相同,不詳細檢驗此變數 我們假設在同一學區中的環境生活品質相似,因此設不同住宅類型的房屋大小為影響房價的主要因子
library(plotly)
##
## 載入套件:'plotly'
## 下列物件被遮斷自 'package:ggplot2':
##
## last_plot
## 下列物件被遮斷自 'package:stats':
##
## filter
## 下列物件被遮斷自 'package:graphics':
##
## layout
attach(houseprice_Harvard)
plot(Sqft_home,Sale_amount,xlab="home size",ylab="sale amount") #散佈圖看似有線性關係
因散佈圖看似有線性關係,我們選擇房屋大小為影響房價的主要因子。
步驟一
(一)、根據房屋大小的分佈畫圖
boxplot(Sqft_home)$out
## [1] 7073
ggplot(houseprice_Harvard, aes(x = Sqft_home, y = Year_type, color = Year_type)) + geom_boxplot()
ggplot(houseprice_Harvard, aes(x = Sqft_home, y = Type, color = Type)) + geom_boxplot()
(二)、用計算四分位距的方式刪除異常點
我們將Sqft_home的1筆outlier刪除,篩選出 1.5 倍四分位距以內的資料。 # 總筆數從66變為65筆資料
houseprice_Harvard1<- houseprice_Harvard[Sqft_home>quantile(Sqft_home,0.25)-1.5*(quantile(Sqft_home,0.75)-quantile(Sqft_home,0.25))& Sqft_home<quantile(Sqft_home,0.75)+1.5*(quantile(Sqft_home,0.75)-quantile(Sqft_home,0.25)),]
houseprice_Harvard1
#沒有outlier的Sqft_home盒狀圖
boxplot(houseprice_Harvard1$Sqft_home)$out
## numeric(0)
步驟二
(一)、觀察房價的分佈情況 #use houseprice_Harvard1 as data
boxplot(houseprice_Harvard1$Sale_amount)$out
## [1] 5000000 4250000 4000000 4000000
ggplot(houseprice_Harvard1, aes(x = Sale_amount, y = Year_type, color = Year_type)) + geom_boxplot()
ggplot(houseprice_Harvard1, aes(x = Sale_amount, y = Type, color = Type)) + geom_boxplot()
(二)、用計算四分位距的方式刪除異常點
我們想移除outlier,刪除4筆資料: Sale_amount== 5000000 4250000 3700000 4000000 4000000 #將Sale_amount的4筆outlier刪除,篩選出 1.5 倍四分位距以內的資料 #總筆數從65變為61筆資料
houseprice_Harvard2<- houseprice_Harvard1[Sale_amount>quantile(Sale_amount,0.25)-1.5*(quantile(Sale_amount,0.75)-quantile(Sale_amount,0.25))&
Sale_amount<quantile(Sale_amount,0.75)+1.5*(quantile(Sale_amount,0.75)-quantile(Sale_amount,0.25)),]
houseprice_Harvard2
(三) 新的Sale_amount盒狀圖 #還有一個outlier值:4e+06
boxplot(houseprice_Harvard2$Sale_amount)$out
## [1] 4e+06
(四)、把NA列去掉 #總筆數從61變為60筆資料
houseprice_Harvard3<-na.omit(houseprice_Harvard2)
houseprice_Harvard3
步驟三
(一)、觀察刪除NA列與outlier的數據– use houseprice_Harvard3 從以下圖表可知,刪除outlier後的數據分配有些許改善
boxplot(houseprice_Harvard3$Sqft_home)$out #沒有離群值了
## numeric(0)
boxplot(houseprice_Harvard3$Sale_amount)$out #還有一個outlier
## [1] 4e+06
3.繪製房屋大小與不同year type的盒狀圖 與先前情況比較,剩下centennial有1個outlier
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Year_type, color = Year_type)) + geom_boxplot()
4.繪製房價與不同year type的盒狀圖 與先前情況比較,centennial的outlier都沒了,但old & new還各有1個
ggplot(houseprice_Harvard3, aes(x = Sale_amount, y = Year_type, color = Year_type)) + geom_boxplot()
5.繪製房屋大小與不同房型的盒狀圖 與先前情況相較,outlier個數相同,但range沒有那麼大
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Type, color = Type)) + geom_boxplot()
ggplot(houseprice_Harvard3, aes(x = Sale_amount, y = Type, color = Type)) + geom_boxplot()
步驟四
#決定放入什麼交互作用項至模型中 1.不同year type下,房屋大小與銷售價格的關係:呈正相關
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Sale_amount, color = Year_type)) + geom_point() + geom_smooth(method = "lm", aes(group = Year_type))
## `geom_smooth()` using formula 'y ~ x'
2.不同beds下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Sale_amount, color = Beds)) + geom_point() + geom_smooth(method = "lm", aes(group = Beds))
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
3.不同baths下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Sale_amount, color = Baths)) + geom_point() + geom_smooth(method = "lm", aes(group = Baths))
## `geom_smooth()` using formula 'y ~ x'
4.不同sqft_lot下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Sale_amount, color = Sqft_lot)) + geom_point() + geom_smooth(method = "lm", aes(group = Sqft_lot))
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf
5.不同type下,房屋大小與銷售價格的關係:呈正相關與負相關
ggplot(houseprice_Harvard3, aes(x = Sqft_home, y = Sale_amount, color = Type)) + geom_point() + geom_smooth(method = "lm", aes(group = Type))
## `geom_smooth()` using formula 'y ~ x'
#因此最後放入year type與type的交互作用項
步驟五
(一)、逐步放入各個變數
fit.1 <- lm (Sale_amount~ Sqft_home,data=houseprice_Harvard3)
summary(fit.1)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1105910 -381274 -185163 196819 1840755
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 431903.4 244414.4 1.767 0.0825 .
## Sqft_home 370.5 86.3 4.293 6.78e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 641800 on 58 degrees of freedom
## Multiple R-squared: 0.2412, Adjusted R-squared: 0.2281
## F-statistic: 18.43 on 1 and 58 DF, p-value: 6.777e-05
# 放入Sqft_home, 顯著相關, Multiple R-squared: 0.2412
fit.2<- lm (Sale_amount~ Sqft_home+Beds,data=houseprice_Harvard3)
summary(fit.2)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1242082 -329268 -129097 188978 1720489
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 646234.95 260606.63 2.480 0.0161 *
## Sqft_home 472.56 98.03 4.820 1.1e-05 ***
## Beds -113960.91 56288.64 -2.025 0.0476 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 625300 on 57 degrees of freedom
## Multiple R-squared: 0.2921, Adjusted R-squared: 0.2672
## F-statistic: 11.76 on 2 and 57 DF, p-value: 5.305e-05
# 放入Beds, 顯著相關, Multiple R-squared: 0.2921
fit.3<- lm (Sale_amount~ Sqft_home+Beds+Baths,data=houseprice_Harvard3)
summary(fit.3)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -958778 -357897 -90603 252017 1388301
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 420458.61 233064.72 1.804 0.076604 .
## Sqft_home 275.97 96.66 2.855 0.006024 **
## Beds -180535.19 51403.76 -3.512 0.000887 ***
## Baths 381069.51 87479.55 4.356 5.7e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 545200 on 56 degrees of freedom
## Multiple R-squared: 0.4712, Adjusted R-squared: 0.4429
## F-statistic: 16.64 on 3 and 56 DF, p-value: 7.546e-08
# 放入Baths, 顯著相關, Multiple R-squared: 0.4712
fit.4<- lm (Sale_amount~ Sqft_home+Beds+Baths+Type,data=houseprice_Harvard3)
summary(fit.4)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -870277 -276703 -93728 123799 1201058
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 46501.48 263588.81 0.176 0.86063
## Sqft_home 307.31 92.94 3.307 0.00168 **
## Beds -106040.48 55977.10 -1.894 0.06354 .
## Baths 295453.17 89738.25 3.292 0.00176 **
## TypeMultiple Occupancy -42069.52 201470.87 -0.209 0.83538
## TypeSingle Family 425118.84 176167.84 2.413 0.01924 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 518800 on 54 degrees of freedom
## Multiple R-squared: 0.5383, Adjusted R-squared: 0.4955
## F-statistic: 12.59 on 5 and 54 DF, p-value: 4.045e-08
# 放入Type, 顯著相關, Multiple R-squared: 0.5383
fit.5<- lm (Sale_amount~ Sqft_home+Beds+Baths+Type+Year_type,data=houseprice_Harvard3)
summary(fit.5)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type +
## Year_type, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -756912 -300250 -69693 178753 1053229
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48867.54 257262.68 0.190 0.850086
## Sqft_home 349.86 97.26 3.597 0.000717 ***
## Beds -107992.64 55539.67 -1.944 0.057263 .
## Baths 285623.43 90082.96 3.171 0.002550 **
## TypeMultiple Occupancy 2469.56 202316.83 0.012 0.990308
## TypeSingle Family 550357.48 181302.20 3.036 0.003745 **
## Year_typenew -492871.35 227564.95 -2.166 0.034930 *
## Year_typeold -171642.15 162305.97 -1.058 0.295162
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 506100 on 52 degrees of freedom
## Multiple R-squared: 0.577, Adjusted R-squared: 0.52
## F-statistic: 10.13 on 7 and 52 DF, p-value: 6.377e-08
# 放入Year_type, 顯著相關, Multiple R-squared: 0.577
fit.6<- lm (Sale_amount~ Sqft_home+Beds+Baths+Type+Year_type+Sqft_lot,data=houseprice_Harvard3)
summary(fit.6)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type +
## Year_type + Sqft_lot, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -759121 -275293 -81225 188411 1074011
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.203e+04 2.588e+05 0.124 0.901995
## Sqft_home 3.709e+02 1.007e+02 3.681 0.000561 ***
## Beds -1.073e+05 5.571e+04 -1.925 0.059772 .
## Baths 2.816e+05 9.048e+04 3.112 0.003039 **
## TypeMultiple Occupancy 4.295e+03 2.029e+05 0.021 0.983195
## TypeSingle Family 5.700e+05 1.833e+05 3.109 0.003071 **
## Year_typenew -4.559e+05 2.325e+05 -1.961 0.055386 .
## Year_typeold -1.818e+05 1.632e+05 -1.113 0.270729
## Sqft_lot -7.775e+00 9.317e+00 -0.834 0.407893
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 507500 on 51 degrees of freedom
## Multiple R-squared: 0.5827, Adjusted R-squared: 0.5172
## F-statistic: 8.9 on 8 and 51 DF, p-value: 1.562e-07
# 放入Sqft_lot, 顯著相關, Multiple R-squared: 0.5827
fit.7<- lm (Sale_amount~ Sqft_home*Year_type+Beds+Baths+Sqft_home*Type+Sqft_lot,data=houseprice_Harvard3)
summary(fit.7)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
## Sqft_home * Type + Sqft_lot, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -751730 -258071 -69849 200730 1119382
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.820e+04 4.233e+05 0.208 0.8358
## Sqft_home 3.739e+02 1.838e+02 2.035 0.0475 *
## Year_typenew -6.060e+05 6.482e+05 -0.935 0.3546
## Year_typeold -4.446e+05 5.121e+05 -0.868 0.3897
## Beds -8.201e+04 6.003e+04 -1.366 0.1784
## Baths 1.988e+05 9.975e+04 1.993 0.0521 .
## TypeMultiple Occupancy 1.807e+06 8.512e+05 2.123 0.0391 *
## TypeSingle Family 5.135e+05 5.355e+05 0.959 0.3425
## Sqft_lot -1.069e+01 9.549e+00 -1.120 0.2685
## Sqft_home:Year_typenew 8.083e+01 2.600e+02 0.311 0.7573
## Sqft_home:Year_typeold 1.090e+02 2.017e+02 0.540 0.5915
## Sqft_home:TypeMultiple Occupancy -5.784e+02 2.844e+02 -2.034 0.0477 *
## Sqft_home:TypeSingle Family 4.584e+01 2.150e+02 0.213 0.8321
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 494300 on 47 degrees of freedom
## Multiple R-squared: 0.6352, Adjusted R-squared: 0.5421
## F-statistic: 6.82 on 12 and 47 DF, p-value: 6.444e-07
# 放入交互作用項 Sqft_home*Year_type & Sqft_home*Type, 顯著相關, Multiple R-squared: 0.6352
(二)、選擇fit.7當作model,並進行stepwise看能不能找到更適合的model ->檢驗Call: lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type, data = houseprice_Harvard7)
library(broom)
tidy(fit.7)
glance(fit.7)
step(fit.7) #use stepwise to find the appropriate model
## Start: AIC=1584.65
## Sale_amount ~ Sqft_home * Year_type + Beds + Baths + Sqft_home *
## Type + Sqft_lot
##
## Df Sum of Sq RSS AIC
## - Sqft_home:Year_type 2 7.1409e+10 1.1555e+13 1581.0
## - Sqft_lot 1 3.0634e+11 1.1790e+13 1584.2
## <none> 1.1483e+13 1584.7
## - Beds 1 4.5598e+11 1.1939e+13 1585.0
## - Baths 1 9.7034e+11 1.2454e+13 1587.5
## - Sqft_home:Type 2 1.6179e+12 1.3101e+13 1588.6
##
## Step: AIC=1581.03
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_lot +
## Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## - Sqft_lot 1 3.2777e+11 1.1882e+13 1580.7
## <none> 1.1555e+13 1581.0
## - Beds 1 4.3243e+11 1.1987e+13 1581.2
## - Year_type 2 9.4931e+11 1.2504e+13 1581.8
## - Baths 1 9.0968e+11 1.2464e+13 1583.6
## - Sqft_home:Type 2 1.5826e+12 1.3137e+13 1584.7
##
## Step: AIC=1580.7
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## <none> 1.1882e+13 1580.7
## - Beds 1 4.9947e+11 1.2382e+13 1581.2
## - Year_type 2 1.1750e+12 1.3057e+13 1582.4
## - Sqft_home:Type 2 1.4342e+12 1.3317e+13 1583.5
## - Baths 1 1.0633e+12 1.2946e+13 1583.8
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths +
## Type + Sqft_home:Type, data = houseprice_Harvard3)
##
## Coefficients:
## (Intercept) Sqft_home
## 4888.56 399.80
## Year_typenew Year_typeold
## -484606.92 -167179.22
## Beds Baths
## -85262.77 203453.40
## TypeMultiple Occupancy TypeSingle Family
## 1607629.78 442999.40
## Sqft_home:TypeMultiple Occupancy Sqft_home:TypeSingle Family
## -506.22 67.21
# Step: AIC=1580.7 smaller than usual
# Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_home:Type
(三)、檢驗fit.8 因其變數顯著較明顯,Multiple R-squared與fit.7差不多,且AIC較小–>選擇fit.8
fit.8 <- lm(Sale_amount ~ Sqft_home + Year_type + Beds + Baths +
Type + Sqft_home:Type, data = houseprice_Harvard3)
summary(fit.8)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths +
## Type + Sqft_home:Type, data = houseprice_Harvard3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -703505 -274247 -63082 228978 1091401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4888.56 401391.32 0.012 0.9903
## Sqft_home 399.80 167.61 2.385 0.0209 *
## Year_typenew -484606.92 219260.70 -2.210 0.0317 *
## Year_typeold -167179.22 156422.73 -1.069 0.2903
## Beds -85262.77 58813.09 -1.450 0.1534
## Baths 203453.40 96186.76 2.115 0.0394 *
## TypeMultiple Occupancy 1607629.78 784355.17 2.050 0.0457 *
## TypeSingle Family 442999.40 470915.95 0.941 0.3514
## Sqft_home:TypeMultiple Occupancy -506.22 253.81 -1.994 0.0516 .
## Sqft_home:TypeSingle Family 67.21 182.72 0.368 0.7145
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 487500 on 50 degrees of freedom
## Multiple R-squared: 0.6225, Adjusted R-squared: 0.5546
## F-statistic: 9.162 on 9 and 50 DF, p-value: 4.954e-08
(四)、plot fit.8 由殘差圖與leverage圖可知資料4、43、46、21 should be delete(可能為outlier)
tidy(fit.8)
glance(fit.8)
new<- augment(fit.8)
resid<- resid(fit.8)#residual
plot(fit.8)
(五)、刪除可能為outlier之值 delete record= 2293, 2310, 2332, 2335 #總筆數從60變為56筆資料 #use houseprice_Harvard7 的data
houseprice_Harvard4<-houseprice_Harvard3[!houseprice_Harvard3$Record=="2293",]#remove no.4
houseprice_Harvard5<-houseprice_Harvard4[!houseprice_Harvard4$Record=="2310",]#remove no.21
houseprice_Harvard6<-houseprice_Harvard5[!houseprice_Harvard5$Record=="2332",]#remove no.43
houseprice_Harvard7<-houseprice_Harvard6[!houseprice_Harvard6$Record=="2335",]#remove no.46
houseprice_Harvard7
步驟六
(一)、重新fit模型
fit.7 得到Multiple R-squared: 0.6764 、Step: AIC=1453
fit.7<- lm (Sale_amount~ Sqft_home*Year_type+Beds+Baths+Sqft_home*Type+Sqft_lot,data=houseprice_Harvard7)
summary(fit.7)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
## Sqft_home * Type + Sqft_lot, data = houseprice_Harvard7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -698731 -167773 -61730 161936 843765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.066e+04 3.352e+05 0.062 0.95113
## Sqft_home 4.658e+02 1.463e+02 3.183 0.00271 **
## Year_typenew -1.802e+05 5.322e+05 -0.339 0.73658
## Year_typeold 1.502e+05 4.367e+05 0.344 0.73260
## Beds -4.971e+04 5.011e+04 -0.992 0.32668
## Baths 9.106e+04 8.352e+04 1.090 0.28165
## TypeMultiple Occupancy 8.635e+05 7.532e+05 1.146 0.25796
## TypeSingle Family 1.328e+05 4.432e+05 0.300 0.76584
## Sqft_lot -1.592e+01 7.591e+00 -2.097 0.04190 *
## Sqft_home:Year_typenew -2.352e+01 2.120e+02 -0.111 0.91217
## Sqft_home:Year_typeold -1.327e+02 1.696e+02 -0.782 0.43848
## Sqft_home:TypeMultiple Occupancy -2.700e+02 2.467e+02 -1.095 0.27968
## Sqft_home:TypeSingle Family 1.668e+02 1.786e+02 0.934 0.35547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 389700 on 43 degrees of freedom
## Multiple R-squared: 0.6764, Adjusted R-squared: 0.586
## F-statistic: 7.488 on 12 and 43 DF, p-value: 3.372e-07
(二)、選擇fit.7當作model,並進行stepwise看能不能找到更適合的model fit 出的最佳model為Step: AIC=1448.14 -> smaller than origin fit.8 #檢驗 Call:lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type, data = houseprice_Harvard7)
library(broom)
tidy(fit.7)
glance(fit.7)
step(fit.7)
## Start: AIC=1453
## Sale_amount ~ Sqft_home * Year_type + Beds + Baths + Sqft_home *
## Type + Sqft_lot
##
## Df Sum of Sq RSS AIC
## - Sqft_home:Year_type 2 1.2332e+11 6.6540e+12 1450.0
## - Beds 1 1.4950e+11 6.6802e+12 1452.3
## - Baths 1 1.8055e+11 6.7112e+12 1452.5
## <none> 6.5307e+12 1453.0
## - Sqft_home:Type 2 7.2543e+11 7.2561e+12 1454.9
## - Sqft_lot 1 6.6797e+11 7.1986e+12 1456.5
##
## Step: AIC=1450.05
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_lot +
## Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## - Year_type 2 3.8711e+11 7.0411e+12 1449.2
## - Beds 1 1.9342e+11 6.8474e+12 1449.7
## <none> 6.6540e+12 1450.0
## - Baths 1 2.5359e+11 6.9076e+12 1450.1
## - Sqft_home:Type 2 8.8099e+11 7.5350e+12 1453.0
## - Sqft_lot 1 6.2818e+11 7.2822e+12 1453.1
##
## Step: AIC=1449.22
## Sale_amount ~ Sqft_home + Beds + Baths + Type + Sqft_lot + Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## - Beds 1 2.1526e+11 7.2564e+12 1448.9
## <none> 7.0411e+12 1449.2
## - Baths 1 3.4671e+11 7.3878e+12 1449.9
## - Sqft_home:Type 2 8.7100e+11 7.9121e+12 1451.8
## - Sqft_lot 1 7.4107e+11 7.7822e+12 1452.8
##
## Step: AIC=1448.9
## Sale_amount ~ Sqft_home + Baths + Type + Sqft_lot + Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## - Baths 1 1.6204e+11 7.4184e+12 1448.1
## <none> 7.2564e+12 1448.9
## - Sqft_lot 1 7.8773e+11 8.0441e+12 1452.7
## - Sqft_home:Type 2 1.0832e+12 8.3395e+12 1452.7
##
## Step: AIC=1448.14
## Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type
##
## Df Sum of Sq RSS AIC
## <none> 7.4184e+12 1448.1
## - Sqft_lot 1 8.4895e+11 8.2673e+12 1452.2
## - Sqft_home:Type 2 1.3992e+12 8.8176e+12 1453.8
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type,
## data = houseprice_Harvard7)
##
## Coefficients:
## (Intercept) Sqft_home
## 169650.45 360.86
## TypeMultiple Occupancy TypeSingle Family
## 996811.52 41854.97
## Sqft_lot Sqft_home:TypeMultiple Occupancy
## -16.59 -332.16
## Sqft_home:TypeSingle Family
## 218.53
(三)、檢驗fit.8
因fit.8的Multiple R-squared: 0.6324 比 fit.7之Multiple R-squared: 0.6764小 而兩者間的AIC僅差4.86,因此我們最後選擇fit.7的模型
fit.8<- lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type,
data = houseprice_Harvard7)
summary(fit.8)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type,
## data = houseprice_Harvard7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -770029 -267231 6589 161505 1004192
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 169650.450 315043.200 0.538 0.59267
## Sqft_home 360.856 114.580 3.149 0.00279 **
## TypeMultiple Occupancy 996811.518 664822.687 1.499 0.14019
## TypeSingle Family 41854.974 370996.219 0.113 0.91064
## Sqft_lot -16.594 7.008 -2.368 0.02187 *
## Sqft_home:TypeMultiple Occupancy -332.157 210.585 -1.577 0.12116
## Sqft_home:TypeSingle Family 218.534 136.701 1.599 0.11633
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 389100 on 49 degrees of freedom
## Multiple R-squared: 0.6324, Adjusted R-squared: 0.5873
## F-statistic: 14.05 on 6 and 49 DF, p-value: 3.186e-09
步驟七
(一)、最終模型 我們選擇下列模型作為最終的配適model,Multiple R-squared: 0.6764
fit.7<- lm (Sale_amount~ Sqft_home*Year_type+Beds+Baths+Sqft_home*Type+Sqft_lot,data=houseprice_Harvard7)
summary(fit.7)
##
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
## Sqft_home * Type + Sqft_lot, data = houseprice_Harvard7)
##
## Residuals:
## Min 1Q Median 3Q Max
## -698731 -167773 -61730 161936 843765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.066e+04 3.352e+05 0.062 0.95113
## Sqft_home 4.658e+02 1.463e+02 3.183 0.00271 **
## Year_typenew -1.802e+05 5.322e+05 -0.339 0.73658
## Year_typeold 1.502e+05 4.367e+05 0.344 0.73260
## Beds -4.971e+04 5.011e+04 -0.992 0.32668
## Baths 9.106e+04 8.352e+04 1.090 0.28165
## TypeMultiple Occupancy 8.635e+05 7.532e+05 1.146 0.25796
## TypeSingle Family 1.328e+05 4.432e+05 0.300 0.76584
## Sqft_lot -1.592e+01 7.591e+00 -2.097 0.04190 *
## Sqft_home:Year_typenew -2.352e+01 2.120e+02 -0.111 0.91217
## Sqft_home:Year_typeold -1.327e+02 1.696e+02 -0.782 0.43848
## Sqft_home:TypeMultiple Occupancy -2.700e+02 2.467e+02 -1.095 0.27968
## Sqft_home:TypeSingle Family 1.668e+02 1.786e+02 0.934 0.35547
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 389700 on 43 degrees of freedom
## Multiple R-squared: 0.6764, Adjusted R-squared: 0.586
## F-statistic: 7.488 on 12 and 43 DF, p-value: 3.372e-07
(二)、plot fit.7 最終殘差圖如下,看起來沒有特別超出的點
plot(fit.7)
(三)、最終模型的詳細資料
tidy(fit.7)
glance(fit.7)