# Point the session at the homework folder so that the relative path
# "houseprice.csv" read further down resolves.
# NOTE(review): this is a machine-specific path; the script fails on any
# machine where the folder does not exist. Consider a project-relative path.
setwd("~/Desktop/data hw/hw2")

1 a.生成一筆資料:

# Simulate 20 observations: an integer level drawn from 0..10 plus
# N(0, sd = 2) noise.
a <- sample(0:10, 20, replace = TRUE)
e <- rnorm(20, mean = 0, sd = 2)
x <- a + e
# Keep the simulated values inside [0, 11] and draw 20 of them with
# replacement (duplicates are expected). Indexing through sample.int() avoids
# the sample() pitfall where a length-1 numeric x is treated as 1:x.
in_range <- x[x >= 0 & x <= 11]
x1 <- in_range[sample.int(length(in_range), 20, replace = TRUE)]
x1
##  [1]  2.395214  7.380624  9.611598 10.447257  5.822333 10.447257  5.686412
##  [8]  5.039281  5.039281  5.039281  5.686412  4.110476  4.520323  5.039281
## [15]  5.686412  4.520323  5.822333  7.042843  7.042843 10.447257
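  b. Cauchy(θ, 1) 的密度函數,取log後對θ一次微分為 $\frac{\partial}{\partial\theta}\log L(\theta) = -2\sum_{i=1}^{n}\frac{\theta - x_i}{1+(\theta - x_i)^2}$,請寫出此function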
# Score function (first derivative in theta of the log-likelihood) of an
# i.i.d. Cauchy(theta, 1) sample:
#   -2 * sum((theta - x_i) / (1 + (theta - x_i)^2))
#
# theta: location parameter at which the score is evaluated (numeric scalar).
# x1:    numeric vector of observations, of any length.
# Returns a single numeric value; 0 for an empty sample.
#
# The computation is vectorised over x1. Every observation contributes exactly
# one term, regardless of its value, because no term is stored at a position
# derived from the observation itself.
cauchy <- function(theta, x1) {
  centered <- theta - x1
  -2 * sum(centered / (1 + centered^2))
}

c.代入a生成的資料至b的function,並令θ=0.3

# Evaluate the score at theta = 0.3 on the simulated sample.
cauchy(theta = 0.3, x1 = x1)
## [1] 2.274594

2 a. 根據Build_year,建立一個新類別變數year_type,1899年以前的房子為“centennial”,1900~1959年為“old”,1960年以上為“new”

# Load the house price data from the working directory.
houseprice <- read.csv("houseprice.csv")
library(tidyverse)
## ─ Attaching packages ──────────────────── tidyverse 1.3.1 ─
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.4     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.0.2     ✓ forcats 0.5.1
## ─ Conflicts ───────────────────── tidyverse_conflicts() ─
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Keep the Harvard University rows, derive Year_type from Build_year
# (<= 1899 "centennial", 1900-1959 "old", otherwise "new"), and move the main
# modelling columns to the front.
houseprice_Harvard <- houseprice %>%
  filter(University == "Harvard University") %>%
  mutate(
    Year_type = ifelse(
      Build_year <= 1899,
      "centennial",
      ifelse(Build_year >= 1900 & Build_year <= 1959, "old", "new")
    )
  ) %>%
  select(
    Record, Sale_amount, Sale_date, Beds, Baths, Sqft_home, Sqft_lot,
    Type, Build_year, Year_type, everything()
  )
houseprice_Harvard # data for the Harvard University area

b.決定好你的最佳配適模型後,總結你的發現並根據解釋變數預測房屋價格 因房屋皆在哈佛大學地區,Town相同,不詳細檢驗此變數 我們假設在同一學區中的環境生活品質相似,因此設不同住宅類型的房屋大小為影響房價的主要因子

library(plotly)
## 
## 載入套件:'plotly'
## 下列物件被遮斷自 'package:ggplot2':
## 
##     last_plot
## 下列物件被遮斷自 'package:stats':
## 
##     filter
## 下列物件被遮斷自 'package:graphics':
## 
##     layout
# Put the columns of houseprice_Harvard on the search path so they can be
# referenced without the data-frame prefix.
# NOTE(review): attach() exposes a snapshot taken here; unqualified column
# names used later still refer to this snapshot, not to data frames filtered
# afterwards. Prefer df$col or with(df, ...).
attach(houseprice_Harvard)
# Scatter plot of sale amount against home size.
plot(Sqft_home,Sale_amount,xlab="home size",ylab="sale amount") # the scatter plot looks roughly linear

因散佈圖看似有線性關係,我們選擇房屋大小為影響房價的主要因子。

步驟一

(一)、根據房屋大小的分佈畫圖

  1. 繪製房屋大小盒狀圖並觀測其離群值,我們發現有1個outlier=7073
# Box plot of home size; $out returns the values drawn as outliers.
# The column is read from houseprice_Harvard explicitly instead of relying on
# the attach()ed search path.
boxplot(houseprice_Harvard$Sqft_home)$out

## [1] 7073
  2. 繪製房屋大小與不同year type的盒狀圖,發現centennial有3個outlier
# Box plot of home size for each Year_type category.
ggplot(
  data = houseprice_Harvard,
  mapping = aes(x = Sqft_home, y = Year_type, color = Year_type)
) +
  geom_boxplot()

  3. 繪製房屋大小與不同房型的盒狀圖,發現single family與 multi family各有1個outlier
# Box plot of home size for each housing Type.
ggplot(
  data = houseprice_Harvard,
  mapping = aes(x = Sqft_home, y = Type, color = Type)
) +
  geom_boxplot()

(二)、用計算四分位距的方式刪除異常點

我們將Sqft_home的1筆outlier刪除,篩選出 1.5 倍四分位距以內的資料。 # 總筆數從66變為65筆資料

# Drop Sqft_home outliers: keep rows strictly inside the 1.5 * IQR fences.
# The column is taken from houseprice_Harvard itself (not from the attach()ed
# search path) so the mask always has one entry per row being filtered, and
# the quartiles are computed once instead of six times.
home_size <- houseprice_Harvard$Sqft_home
home_quartiles <- quantile(home_size, c(0.25, 0.75))
home_iqr <- home_quartiles[[2]] - home_quartiles[[1]]
home_in_fences <- home_size > home_quartiles[[1]] - 1.5 * home_iqr &
  home_size < home_quartiles[[2]] + 1.5 * home_iqr
houseprice_Harvard1 <- houseprice_Harvard[home_in_fences, ]
houseprice_Harvard1

#沒有outlier的Sqft_home盒狀圖

# Re-draw the home-size box plot on the filtered data and list its outliers.
boxplot(houseprice_Harvard1[["Sqft_home"]])$out

## numeric(0)

步驟二

(一)、觀察房價的分佈情況 #use houseprice_Harvard1 as data

  1. 繪製房價盒狀圖並觀測其離群值,得到4個outlier: 5000000 4250000 4000000 4000000
# Box plot of sale amount; $out lists the values drawn as outliers.
boxplot(houseprice_Harvard1[["Sale_amount"]])$out

## [1] 5000000 4250000 4000000 4000000
  2. 繪製房價與不同year type的盒狀圖,得到old & new各1個outlier、centennial有2個outlier
# Box plot of sale amount for each Year_type category.
ggplot(
  data = houseprice_Harvard1,
  mapping = aes(x = Sale_amount, y = Year_type, color = Year_type)
) +
  geom_boxplot()

  3. 繪製房價與不同房型的盒狀圖,我們得到multiple occupancy 跟 multi family各2個outlier
# Box plot of sale amount for each housing Type.
ggplot(
  data = houseprice_Harvard1,
  mapping = aes(x = Sale_amount, y = Type, color = Type)
) +
  geom_boxplot()

(二)、用計算四分位距的方式刪除異常點

我們想移除outlier,刪除4筆資料: Sale_amount== 5000000 4250000 3700000 4000000 4000000 #將Sale_amount的4筆outlier刪除,篩選出 1.5 倍四分位距以內的資料 #總筆數從65變為61筆資料

# Drop Sale_amount outliers: keep rows strictly inside the 1.5 * IQR fences.
# The price column and its quartiles are taken from houseprice_Harvard1
# itself, so the logical mask has exactly one entry per row being filtered.
# An unqualified Sale_amount would resolve to the copy attach()ed from
# houseprice_Harvard, whose length and row order no longer match
# houseprice_Harvard1 after the Sqft_home filter. Such a mask keeps/drops the
# wrong rows and can append an all-NA row.
# NOTE: console output recorded further down was produced with the misaligned
# mask; re-run the script to refresh it.
sale_price <- houseprice_Harvard1$Sale_amount
sale_quartiles <- quantile(sale_price, c(0.25, 0.75))
sale_iqr <- sale_quartiles[[2]] - sale_quartiles[[1]]
sale_in_fences <- sale_price > sale_quartiles[[1]] - 1.5 * sale_iqr &
  sale_price < sale_quartiles[[2]] + 1.5 * sale_iqr
houseprice_Harvard2 <- houseprice_Harvard1[sale_in_fences, ]
houseprice_Harvard2

(三) 新的Sale_amount盒狀圖 #還有一個outlier值:4e+06

# Re-draw the sale-amount box plot on the filtered data and list its outliers.
boxplot(houseprice_Harvard2[["Sale_amount"]])$out

## [1] 4e+06

(四)、把NA列去掉 #總筆數從61變為60筆資料

# Remove every row that contains at least one NA, then print the result.
houseprice_Harvard3<-na.omit(houseprice_Harvard2)
houseprice_Harvard3

步驟三

(一)、觀察刪除NA列與outlier的數據– use houseprice_Harvard3 從以下圖表可知,刪除outlier後的數據分配有些許改善

  1. 繪製刪除outlier的房屋大小盒狀圖
# Home-size box plot on the cleaned data.
boxplot(houseprice_Harvard3[["Sqft_home"]])$out # no outliers remain

## numeric(0)
  2. 繪製刪除outlier的房價盒狀圖
# Sale-amount box plot on the cleaned data.
boxplot(houseprice_Harvard3[["Sale_amount"]])$out # one outlier is still present

## [1] 4e+06

3.繪製房屋大小與不同year type的盒狀圖 與先前情況比較,剩下centennial有1個outlier

# Box plot of home size for each Year_type category (cleaned data).
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Year_type, color = Year_type)
) +
  geom_boxplot()

4.繪製房價與不同year type的盒狀圖 與先前情況比較,centennial的outlier都沒了,但old & new還各有1個

# Box plot of sale amount for each Year_type category (cleaned data).
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sale_amount, y = Year_type, color = Year_type)
) +
  geom_boxplot()

5.繪製房屋大小與不同房型的盒狀圖 與先前情況相較,outlier個數相同,但range沒有那麼大

# Box plot of home size for each housing Type (cleaned data).
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Type, color = Type)
) +
  geom_boxplot()

  6. 繪製房價與不同房型的盒狀圖 與先前情況相較,outlier個數相同,但range沒有那麼大
# Box plot of sale amount for each housing Type (cleaned data).
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sale_amount, y = Type, color = Type)
) +
  geom_boxplot()

步驟四

#決定放入什麼交互作用項至模型中 1.不同year type下,房屋大小與銷售價格的關係:呈正相關

# Sale amount against home size, with one linear fit per Year_type.
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Sale_amount, color = Year_type)
) +
  geom_point() +
  geom_smooth(mapping = aes(group = Year_type), method = "lm")
## `geom_smooth()` using formula 'y ~ x'

2.不同beds下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)

# Sale amount against home size, with one linear fit per distinct Beds value.
# NOTE(review): the NaN warnings recorded below presumably come from groups
# with too few points to fit a line - confirm.
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Sale_amount, color = Beds)
) +
  geom_point() +
  geom_smooth(mapping = aes(group = Beds), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

3.不同baths下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)

# Sale amount against home size, with one linear fit per distinct Baths value.
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Sale_amount, color = Baths)
) +
  geom_point() +
  geom_smooth(mapping = aes(group = Baths), method = "lm")
## `geom_smooth()` using formula 'y ~ x'

4.不同sqft_lot下,房屋大小與銷售價格的關係:相關性不大(交互作用項不放入)

# Sale amount against home size, with one linear fit per distinct Sqft_lot
# value.
# NOTE(review): the NaN warnings recorded below presumably come from groups
# with too few points to fit a line - confirm.
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Sale_amount, color = Sqft_lot)
) +
  geom_point() +
  geom_smooth(mapping = aes(group = Sqft_lot), method = "lm")
## `geom_smooth()` using formula 'y ~ x'
## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs

## Warning in qt((1 - level)/2, df): 產生了 NaNs
## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

## Warning in max(ids, na.rm = TRUE): max 中沒有無漏失的引數;回傳 -Inf

5.不同type下,房屋大小與銷售價格的關係:呈正相關與負相關

# Sale amount against home size, with one linear fit per housing Type.
ggplot(
  data = houseprice_Harvard3,
  mapping = aes(x = Sqft_home, y = Sale_amount, color = Type)
) +
  geom_point() +
  geom_smooth(mapping = aes(group = Type), method = "lm")
## `geom_smooth()` using formula 'y ~ x'

#因此最後放入year type與type的交互作用項

步驟五

(一)、逐步放入各個變數

# Model 1: sale amount explained by home size alone.
fit.1 <- lm(
  formula = Sale_amount ~ Sqft_home,
  data = houseprice_Harvard3
)
summary(fit.1)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home, data = houseprice_Harvard3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1105910  -381274  -185163   196819  1840755 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 431903.4   244414.4   1.767   0.0825 .  
## Sqft_home      370.5       86.3   4.293 6.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 641800 on 58 degrees of freedom
## Multiple R-squared:  0.2412, Adjusted R-squared:  0.2281 
## F-statistic: 18.43 on 1 and 58 DF,  p-value: 6.777e-05
# 放入Sqft_home, 顯著相關, Multiple R-squared:  0.2412

# Model 2: add the number of bedrooms.
fit.2 <- lm(
  formula = Sale_amount ~ Sqft_home + Beds,
  data = houseprice_Harvard3
)
summary(fit.2)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds, data = houseprice_Harvard3)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1242082  -329268  -129097   188978  1720489 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  646234.95  260606.63   2.480   0.0161 *  
## Sqft_home       472.56      98.03   4.820  1.1e-05 ***
## Beds        -113960.91   56288.64  -2.025   0.0476 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 625300 on 57 degrees of freedom
## Multiple R-squared:  0.2921, Adjusted R-squared:  0.2672 
## F-statistic: 11.76 on 2 and 57 DF,  p-value: 5.305e-05
# 放入Beds, 顯著相關, Multiple R-squared:  0.2921

# Model 3: add the number of bathrooms.
fit.3 <- lm(
  formula = Sale_amount ~ Sqft_home + Beds + Baths,
  data = houseprice_Harvard3
)
summary(fit.3)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -958778 -357897  -90603  252017 1388301 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  420458.61  233064.72   1.804 0.076604 .  
## Sqft_home       275.97      96.66   2.855 0.006024 ** 
## Beds        -180535.19   51403.76  -3.512 0.000887 ***
## Baths        381069.51   87479.55   4.356  5.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 545200 on 56 degrees of freedom
## Multiple R-squared:  0.4712, Adjusted R-squared:  0.4429 
## F-statistic: 16.64 on 3 and 56 DF,  p-value: 7.546e-08
# 放入Baths, 顯著相關, Multiple R-squared: 0.4712

# Model 4: add the housing type.
fit.4 <- lm(
  formula = Sale_amount ~ Sqft_home + Beds + Baths + Type,
  data = houseprice_Harvard3
)
summary(fit.4)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -870277 -276703  -93728  123799 1201058 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)   
## (Intercept)              46501.48  263588.81   0.176  0.86063   
## Sqft_home                  307.31      92.94   3.307  0.00168 **
## Beds                   -106040.48   55977.10  -1.894  0.06354 . 
## Baths                   295453.17   89738.25   3.292  0.00176 **
## TypeMultiple Occupancy  -42069.52  201470.87  -0.209  0.83538   
## TypeSingle Family       425118.84  176167.84   2.413  0.01924 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 518800 on 54 degrees of freedom
## Multiple R-squared:  0.5383, Adjusted R-squared:  0.4955 
## F-statistic: 12.59 on 5 and 54 DF,  p-value: 4.045e-08
# 放入Type, 顯著相關, Multiple R-squared: 0.5383

# Model 5: add the construction-era category.
fit.5 <- lm(
  formula = Sale_amount ~ Sqft_home + Beds + Baths + Type + Year_type,
  data = houseprice_Harvard3
)
summary(fit.5)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type + 
##     Year_type, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -756912 -300250  -69693  178753 1053229 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              48867.54  257262.68   0.190 0.850086    
## Sqft_home                  349.86      97.26   3.597 0.000717 ***
## Beds                   -107992.64   55539.67  -1.944 0.057263 .  
## Baths                   285623.43   90082.96   3.171 0.002550 ** 
## TypeMultiple Occupancy    2469.56  202316.83   0.012 0.990308    
## TypeSingle Family       550357.48  181302.20   3.036 0.003745 ** 
## Year_typenew           -492871.35  227564.95  -2.166 0.034930 *  
## Year_typeold           -171642.15  162305.97  -1.058 0.295162    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 506100 on 52 degrees of freedom
## Multiple R-squared:  0.577,  Adjusted R-squared:   0.52 
## F-statistic: 10.13 on 7 and 52 DF,  p-value: 6.377e-08
# 放入Year_type, 顯著相關, Multiple R-squared: 0.577

# Model 6: add the lot size.
fit.6 <- lm(
  formula = Sale_amount ~ Sqft_home + Beds + Baths + Type + Year_type +
    Sqft_lot,
  data = houseprice_Harvard3
)
summary(fit.6)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Beds + Baths + Type + 
##     Year_type + Sqft_lot, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -759121 -275293  -81225  188411 1074011 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             3.203e+04  2.588e+05   0.124 0.901995    
## Sqft_home               3.709e+02  1.007e+02   3.681 0.000561 ***
## Beds                   -1.073e+05  5.571e+04  -1.925 0.059772 .  
## Baths                   2.816e+05  9.048e+04   3.112 0.003039 ** 
## TypeMultiple Occupancy  4.295e+03  2.029e+05   0.021 0.983195    
## TypeSingle Family       5.700e+05  1.833e+05   3.109 0.003071 ** 
## Year_typenew           -4.559e+05  2.325e+05  -1.961 0.055386 .  
## Year_typeold           -1.818e+05  1.632e+05  -1.113 0.270729    
## Sqft_lot               -7.775e+00  9.317e+00  -0.834 0.407893    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 507500 on 51 degrees of freedom
## Multiple R-squared:  0.5827, Adjusted R-squared:  0.5172 
## F-statistic:   8.9 on 8 and 51 DF,  p-value: 1.562e-07
# 放入Sqft_lot, 不顯著 (p = 0.408), Multiple R-squared: 0.5827

# Model 7: add the Sqft_home x Year_type and Sqft_home x Type interactions.
fit.7 <- lm(
  formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
    Sqft_home * Type + Sqft_lot,
  data = houseprice_Harvard3
)
summary(fit.7)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths + 
##     Sqft_home * Type + Sqft_lot, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -751730 -258071  -69849  200730 1119382 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                       8.820e+04  4.233e+05   0.208   0.8358  
## Sqft_home                         3.739e+02  1.838e+02   2.035   0.0475 *
## Year_typenew                     -6.060e+05  6.482e+05  -0.935   0.3546  
## Year_typeold                     -4.446e+05  5.121e+05  -0.868   0.3897  
## Beds                             -8.201e+04  6.003e+04  -1.366   0.1784  
## Baths                             1.988e+05  9.975e+04   1.993   0.0521 .
## TypeMultiple Occupancy            1.807e+06  8.512e+05   2.123   0.0391 *
## TypeSingle Family                 5.135e+05  5.355e+05   0.959   0.3425  
## Sqft_lot                         -1.069e+01  9.549e+00  -1.120   0.2685  
## Sqft_home:Year_typenew            8.083e+01  2.600e+02   0.311   0.7573  
## Sqft_home:Year_typeold            1.090e+02  2.017e+02   0.540   0.5915  
## Sqft_home:TypeMultiple Occupancy -5.784e+02  2.844e+02  -2.034   0.0477 *
## Sqft_home:TypeSingle Family       4.584e+01  2.150e+02   0.213   0.8321  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 494300 on 47 degrees of freedom
## Multiple R-squared:  0.6352, Adjusted R-squared:  0.5421 
## F-statistic:  6.82 on 12 and 47 DF,  p-value: 6.444e-07
# 放入交互作用項 Sqft_home*Year_type & Sqft_home*Type, 顯著相關, Multiple R-squared:  0.6352

(二)、選擇fit.7當作model,並進行stepwise看能不能找到更適合的model ->檢驗Call: lm(formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_home:Type, data = houseprice_Harvard3)

library(broom)
# Coefficient table and one-row model summary for fit.7.
tidy(x = fit.7)
glance(x = fit.7)
# AIC-based stepwise search starting from fit.7.
step(object = fit.7)
## Start:  AIC=1584.65
## Sale_amount ~ Sqft_home * Year_type + Beds + Baths + Sqft_home * 
##     Type + Sqft_lot
## 
##                       Df  Sum of Sq        RSS    AIC
## - Sqft_home:Year_type  2 7.1409e+10 1.1555e+13 1581.0
## - Sqft_lot             1 3.0634e+11 1.1790e+13 1584.2
## <none>                              1.1483e+13 1584.7
## - Beds                 1 4.5598e+11 1.1939e+13 1585.0
## - Baths                1 9.7034e+11 1.2454e+13 1587.5
## - Sqft_home:Type       2 1.6179e+12 1.3101e+13 1588.6
## 
## Step:  AIC=1581.03
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_lot + 
##     Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## - Sqft_lot        1 3.2777e+11 1.1882e+13 1580.7
## <none>                         1.1555e+13 1581.0
## - Beds            1 4.3243e+11 1.1987e+13 1581.2
## - Year_type       2 9.4931e+11 1.2504e+13 1581.8
## - Baths           1 9.0968e+11 1.2464e+13 1583.6
## - Sqft_home:Type  2 1.5826e+12 1.3137e+13 1584.7
## 
## Step:  AIC=1580.7
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## <none>                         1.1882e+13 1580.7
## - Beds            1 4.9947e+11 1.2382e+13 1581.2
## - Year_type       2 1.1750e+12 1.3057e+13 1582.4
## - Sqft_home:Type  2 1.4342e+12 1.3317e+13 1583.5
## - Baths           1 1.0633e+12 1.2946e+13 1583.8
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths + 
##     Type + Sqft_home:Type, data = houseprice_Harvard3)
## 
## Coefficients:
##                      (Intercept)                         Sqft_home  
##                          4888.56                            399.80  
##                     Year_typenew                      Year_typeold  
##                       -484606.92                        -167179.22  
##                             Beds                             Baths  
##                        -85262.77                         203453.40  
##           TypeMultiple Occupancy                 TypeSingle Family  
##                       1607629.78                         442999.40  
## Sqft_home:TypeMultiple Occupancy       Sqft_home:TypeSingle Family  
##                          -506.22                             67.21
# Step:  AIC=1580.7,比起始模型的 AIC=1584.65 小
# Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_home:Type

(三)、檢驗fit.8 因其變數顯著較明顯,Multiple R-squared與fit.7差不多,且AIC較小–>選擇fit.8

# Model 8: the formula selected by the stepwise search above.
fit.8 <- lm(
  formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type +
    Sqft_home:Type,
  data = houseprice_Harvard3
)
summary(fit.8)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Year_type + Beds + Baths + 
##     Type + Sqft_home:Type, data = houseprice_Harvard3)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -703505 -274247  -63082  228978 1091401 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                         4888.56  401391.32   0.012   0.9903  
## Sqft_home                            399.80     167.61   2.385   0.0209 *
## Year_typenew                     -484606.92  219260.70  -2.210   0.0317 *
## Year_typeold                     -167179.22  156422.73  -1.069   0.2903  
## Beds                              -85262.77   58813.09  -1.450   0.1534  
## Baths                             203453.40   96186.76   2.115   0.0394 *
## TypeMultiple Occupancy           1607629.78  784355.17   2.050   0.0457 *
## TypeSingle Family                 442999.40  470915.95   0.941   0.3514  
## Sqft_home:TypeMultiple Occupancy    -506.22     253.81  -1.994   0.0516 .
## Sqft_home:TypeSingle Family           67.21     182.72   0.368   0.7145  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 487500 on 50 degrees of freedom
## Multiple R-squared:  0.6225, Adjusted R-squared:  0.5546 
## F-statistic: 9.162 on 9 and 50 DF,  p-value: 4.954e-08

(四)、plot fit.8 由殘差圖與leverage圖可知資料4、43、46、21應刪除(可能為outlier)

# Coefficient table and one-row model summary for fit.8.
tidy(x = fit.8)
glance(x = fit.8)
# Per-observation fitted values and diagnostics.
new <- augment(x = fit.8)
# Residuals of fit.8.
resid <- residuals(fit.8)
# Standard lm diagnostic plots.
plot(x = fit.8)

(五)、刪除可能為outlier之值 delete record= 2293, 2310, 2332, 2335 #總筆數從60變為56筆資料 #use houseprice_Harvard7 的data

# Drop the four flagged records one at a time, keeping each intermediate
# data frame.
houseprice_Harvard4 <- houseprice_Harvard3[houseprice_Harvard3$Record != "2293", ] # remove no. 4
houseprice_Harvard5 <- houseprice_Harvard4[houseprice_Harvard4$Record != "2310", ] # remove no. 21
houseprice_Harvard6 <- houseprice_Harvard5[houseprice_Harvard5$Record != "2332", ] # remove no. 43
houseprice_Harvard7 <- houseprice_Harvard6[houseprice_Harvard6$Record != "2335", ] # remove no. 46
houseprice_Harvard7

步驟六

(一)、重新fit模型

fit.7 得到Multiple R-squared: 0.6764 、Step: AIC=1453

# Refit the full interaction model on the data without the flagged records.
fit.7 <- lm(
  formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
    Sqft_home * Type + Sqft_lot,
  data = houseprice_Harvard7
)
summary(fit.7)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths + 
##     Sqft_home * Type + Sqft_lot, data = houseprice_Harvard7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -698731 -167773  -61730  161936  843765 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       2.066e+04  3.352e+05   0.062  0.95113   
## Sqft_home                         4.658e+02  1.463e+02   3.183  0.00271 **
## Year_typenew                     -1.802e+05  5.322e+05  -0.339  0.73658   
## Year_typeold                      1.502e+05  4.367e+05   0.344  0.73260   
## Beds                             -4.971e+04  5.011e+04  -0.992  0.32668   
## Baths                             9.106e+04  8.352e+04   1.090  0.28165   
## TypeMultiple Occupancy            8.635e+05  7.532e+05   1.146  0.25796   
## TypeSingle Family                 1.328e+05  4.432e+05   0.300  0.76584   
## Sqft_lot                         -1.592e+01  7.591e+00  -2.097  0.04190 * 
## Sqft_home:Year_typenew           -2.352e+01  2.120e+02  -0.111  0.91217   
## Sqft_home:Year_typeold           -1.327e+02  1.696e+02  -0.782  0.43848   
## Sqft_home:TypeMultiple Occupancy -2.700e+02  2.467e+02  -1.095  0.27968   
## Sqft_home:TypeSingle Family       1.668e+02  1.786e+02   0.934  0.35547   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 389700 on 43 degrees of freedom
## Multiple R-squared:  0.6764, Adjusted R-squared:  0.586 
## F-statistic: 7.488 on 12 and 43 DF,  p-value: 3.372e-07

(二)、選擇fit.7當作model,並進行stepwise看能不能找到更適合的model fit 出的最佳model為Step: AIC=1448.14 -> smaller than origin fit.8 #檢驗 Call:lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type, data = houseprice_Harvard7)

library(broom)
# Coefficient table and one-row model summary for the refitted fit.7.
tidy(x = fit.7)
glance(x = fit.7)
# AIC-based stepwise search starting from the refitted fit.7.
step(object = fit.7)
## Start:  AIC=1453
## Sale_amount ~ Sqft_home * Year_type + Beds + Baths + Sqft_home * 
##     Type + Sqft_lot
## 
##                       Df  Sum of Sq        RSS    AIC
## - Sqft_home:Year_type  2 1.2332e+11 6.6540e+12 1450.0
## - Beds                 1 1.4950e+11 6.6802e+12 1452.3
## - Baths                1 1.8055e+11 6.7112e+12 1452.5
## <none>                              6.5307e+12 1453.0
## - Sqft_home:Type       2 7.2543e+11 7.2561e+12 1454.9
## - Sqft_lot             1 6.6797e+11 7.1986e+12 1456.5
## 
## Step:  AIC=1450.05
## Sale_amount ~ Sqft_home + Year_type + Beds + Baths + Type + Sqft_lot + 
##     Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## - Year_type       2 3.8711e+11 7.0411e+12 1449.2
## - Beds            1 1.9342e+11 6.8474e+12 1449.7
## <none>                         6.6540e+12 1450.0
## - Baths           1 2.5359e+11 6.9076e+12 1450.1
## - Sqft_home:Type  2 8.8099e+11 7.5350e+12 1453.0
## - Sqft_lot        1 6.2818e+11 7.2822e+12 1453.1
## 
## Step:  AIC=1449.22
## Sale_amount ~ Sqft_home + Beds + Baths + Type + Sqft_lot + Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## - Beds            1 2.1526e+11 7.2564e+12 1448.9
## <none>                         7.0411e+12 1449.2
## - Baths           1 3.4671e+11 7.3878e+12 1449.9
## - Sqft_home:Type  2 8.7100e+11 7.9121e+12 1451.8
## - Sqft_lot        1 7.4107e+11 7.7822e+12 1452.8
## 
## Step:  AIC=1448.9
## Sale_amount ~ Sqft_home + Baths + Type + Sqft_lot + Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## - Baths           1 1.6204e+11 7.4184e+12 1448.1
## <none>                         7.2564e+12 1448.9
## - Sqft_lot        1 7.8773e+11 8.0441e+12 1452.7
## - Sqft_home:Type  2 1.0832e+12 8.3395e+12 1452.7
## 
## Step:  AIC=1448.14
## Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type
## 
##                  Df  Sum of Sq        RSS    AIC
## <none>                         7.4184e+12 1448.1
## - Sqft_lot        1 8.4895e+11 8.2673e+12 1452.2
## - Sqft_home:Type  2 1.3992e+12 8.8176e+12 1453.8
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type, 
##     data = houseprice_Harvard7)
## 
## Coefficients:
##                      (Intercept)                         Sqft_home  
##                        169650.45                            360.86  
##           TypeMultiple Occupancy                 TypeSingle Family  
##                        996811.52                          41854.97  
##                         Sqft_lot  Sqft_home:TypeMultiple Occupancy  
##                           -16.59                           -332.16  
##      Sqft_home:TypeSingle Family  
##                           218.53

(三)、檢驗fit.8

因fit.8的Multiple R-squared: 0.6324 比 fit.7之Multiple R-squared: 0.6764小 而兩者間的AIC僅差4.86,因此我們最後選擇fit.7的模型

# Fit the formula selected by the second stepwise search.
fit.8 <- lm(
  formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type,
  data = houseprice_Harvard7
)
summary(fit.8)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home + Type + Sqft_lot + Sqft_home:Type, 
##     data = houseprice_Harvard7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -770029 -267231    6589  161505 1004192 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                      169650.450 315043.200   0.538  0.59267   
## Sqft_home                           360.856    114.580   3.149  0.00279 **
## TypeMultiple Occupancy           996811.518 664822.687   1.499  0.14019   
## TypeSingle Family                 41854.974 370996.219   0.113  0.91064   
## Sqft_lot                            -16.594      7.008  -2.368  0.02187 * 
## Sqft_home:TypeMultiple Occupancy   -332.157    210.585  -1.577  0.12116   
## Sqft_home:TypeSingle Family         218.534    136.701   1.599  0.11633   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 389100 on 49 degrees of freedom
## Multiple R-squared:  0.6324, Adjusted R-squared:  0.5873 
## F-statistic: 14.05 on 6 and 49 DF,  p-value: 3.186e-09

步驟七

(一)、最終模型 我們選擇下列模型作為最終的配適model,Multiple R-squared: 0.6764

# Final model: the full interaction model on houseprice_Harvard7.
fit.7 <- lm(
  formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths +
    Sqft_home * Type + Sqft_lot,
  data = houseprice_Harvard7
)
summary(fit.7)
## 
## Call:
## lm(formula = Sale_amount ~ Sqft_home * Year_type + Beds + Baths + 
##     Sqft_home * Type + Sqft_lot, data = houseprice_Harvard7)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -698731 -167773  -61730  161936  843765 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                       2.066e+04  3.352e+05   0.062  0.95113   
## Sqft_home                         4.658e+02  1.463e+02   3.183  0.00271 **
## Year_typenew                     -1.802e+05  5.322e+05  -0.339  0.73658   
## Year_typeold                      1.502e+05  4.367e+05   0.344  0.73260   
## Beds                             -4.971e+04  5.011e+04  -0.992  0.32668   
## Baths                             9.106e+04  8.352e+04   1.090  0.28165   
## TypeMultiple Occupancy            8.635e+05  7.532e+05   1.146  0.25796   
## TypeSingle Family                 1.328e+05  4.432e+05   0.300  0.76584   
## Sqft_lot                         -1.592e+01  7.591e+00  -2.097  0.04190 * 
## Sqft_home:Year_typenew           -2.352e+01  2.120e+02  -0.111  0.91217   
## Sqft_home:Year_typeold           -1.327e+02  1.696e+02  -0.782  0.43848   
## Sqft_home:TypeMultiple Occupancy -2.700e+02  2.467e+02  -1.095  0.27968   
## Sqft_home:TypeSingle Family       1.668e+02  1.786e+02   0.934  0.35547   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 389700 on 43 degrees of freedom
## Multiple R-squared:  0.6764, Adjusted R-squared:  0.586 
## F-statistic: 7.488 on 12 and 43 DF,  p-value: 3.372e-07

(二)、plot fit.7 最終殘差圖如下,看起來沒有特別超出的點

# Standard lm diagnostic plots for the final model.
plot(x = fit.7)

(三)、最終模型的詳細資料

# Coefficient table and one-row model summary for the final model.
tidy(x = fit.7)
glance(x = fit.7)