Regression

X <-  c(17, 21, 35, 39, 50, 65)
Y <-  c(132, 150, 160, 162, 149, 170)
Y_avg <- mean(Y)
X_avg <- mean(X)
B1 <- (sum(X*Y) - 6 * X_avg * Y_avg) / (sum(X^2) - 6 * X_avg  ^ 2)
B0 <- Y_avg - B1 * X_avg
B0
## [1] 132.913
B1
## [1] 0.5529606

Linear Regression

data(anscombe)
View(anscombe)
# method 1
plot(anscombe$x1, anscombe$y1)

# method 2
plot(y1 ~ x1, data = anscombe )
fit <- lm(y1 ~ x1, data = anscombe)

fit
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
summary(fit)
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.92127 -0.45577 -0.04136  0.70941  1.83882 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0001     1.1247   2.667  0.02573 * 
## x1            0.5001     0.1179   4.241  0.00217 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6665, Adjusted R-squared:  0.6295 
## F-statistic: 17.99 on 1 and 9 DF,  p-value: 0.00217
plot(y1 ~ x1, data = anscombe )
abline(fit, col="red")

predict(fit, data.frame(x1 = 14))
##        1 
## 10.00136
predict(fit, data.frame(x1 = 16))
##        1 
## 11.00155
fit <- lm(y2 ~ x1, data = anscombe)
plot(y2 ~ x1, data = anscombe)
abline(fit, col= "red")

summary(fit)
## 
## Call:
## lm(formula = y2 ~ x1, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9009 -0.7609  0.1291  0.9491  1.2691 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)    3.001      1.125   2.667  0.02576 * 
## x1             0.500      0.118   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.237 on 9 degrees of freedom
## Multiple R-squared:  0.6662, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002179
fit2 <- lm(y2 ~ poly(x1, 2), data = anscombe)
summary(fit2)
## 
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0013287 -0.0011888 -0.0006294  0.0008741  0.0023776 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.5009091  0.0005043   14875   <2e-16 ***
## poly(x1, 2)1  5.2440442  0.0016725    3135   <2e-16 ***
## poly(x1, 2)2 -3.7116396  0.0016725   -2219   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.001672 on 8 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 7.378e+06 on 2 and 8 DF,  p-value: < 2.2e-16
plot(y2 ~ x1, data = anscombe)
lines(sort(anscombe$x1), fit2$fitted.values[order(anscombe$x1)], col="red")

## RLM

fit3 <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
fit3 <- lm(y3 ~ x1, data = anscombe)
abline(fit3, col = "red")

#install.packages("MASS")
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.3
fit4 <- rlm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit4, col = 'red')

Perform Regression on House Pricing Data

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv')

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
plot(Price ~ SqFt, data = house_prices)
fit <- lm(Price ~ SqFt, data = house_prices)
abline(fit, col= "red")

## 591 租屋價格搜集

library(httr)

dfall <- data.frame()
for(pagecnt in 0:24){
  res <- GET(paste0('https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=1&searchtype=1&region=1&section=5&firstRow=', pagecnt * 30 ,'&totalRows=711'))
  house <- content(res)
  print(pagecnt)

  for( item in house$data$data){
    df <- data.frame(address = item$address, price = item$price, area= item$area)
    dfall <- rbind(dfall, df)
  }
}
head(dfall)
str(dfall)

591 租屋價格處理

dfall$price <- as.integer(sub(pattern=',', replacement = '',dfall$price))
write.csv(x =dfall, file = 'rent_591_sample.csv')

591 租屋價格預測

rent<- read.csv('rent_591_sample.csv')
head(rent)
##   X                    address  price area
## 1 1  建國南路二段151巷大安森..  50000   45
## 2 2   大安路一段‧正兩房‧電梯..  25000   18
## 3 3 新生南路二段正面大安森林.. 200000  151
## 4 4  復興南路一段219巷忠孝復..  48000   36
## 5 5       和平東路二段兩房一..  25009   25
## 6 6       光復南路光復南路仁..  22914   20
plot(price ~ area, data = rent)
fit <- lm(price ~ area, data = rent)
fit
## 
## Call:
## lm(formula = price ~ area, data = rent)
## 
## Coefficients:
## (Intercept)         area  
##       485.6       1484.5
abline(fit, col="red")

predict(fit, data.frame(area = 20))
##        1 
## 30176.15

Normal Diestribution

 curve(dnorm(x), -5, 5, col="black") 

Perform Regression on House Pricing Data (2)

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv')

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
plot(Price ~ SqFt, data = house_prices)

fit <- lm(Price ~ SqFt + Bedrooms + Bathrooms + Offers, data = house_prices)

summary(fit)
## 
## Call:
## lm(formula = Price ~ SqFt + Bedrooms + Bathrooms + Offers, data = house_prices)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -33608  -9889  -2968   9398  43243 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -17347.377  12724.896  -1.363    0.175    
## SqFt            61.840      8.264   7.483 1.20e-11 ***
## Bedrooms      9319.753   2148.754   4.337 2.97e-05 ***
## Bathrooms    12646.347   3109.662   4.067 8.45e-05 ***
## Offers      -13601.011   1324.819 -10.266  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15000 on 123 degrees of freedom
## Multiple R-squared:  0.6982, Adjusted R-squared:  0.6884 
## F-statistic: 71.13 on 4 and 123 DF,  p-value: < 2.2e-16

使用R Squared 評估

## lm
data("anscombe")
plot(y3 ~ x1, data = anscombe)
lmfit <- lm(y3 ~ x1, data = anscombe)
summary(lmfit)
## 
## Call:
## lm(formula = y3 ~ x1, data = anscombe)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1586 -0.6146 -0.2303  0.1540  3.2411 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)   3.0025     1.1245   2.670  0.02562 * 
## x1            0.4997     0.1179   4.239  0.00218 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.236 on 9 degrees of freedom
## Multiple R-squared:  0.6663, Adjusted R-squared:  0.6292 
## F-statistic: 17.97 on 1 and 9 DF,  p-value: 0.002176
abline(lmfit, coll ="red")
## Warning in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...): "coll"
## 不是一個繪圖參數

predicted <- predict(lmfit, newsdata = anscombe[,x1])
predicted
##        1        2        3        4        5        6        7        8 
## 7.999727 7.000273 9.498909 7.500000 8.499455 9.998636 6.000818 5.001364 
##        9       10       11 
## 8.999182 6.500545 5.501091
actual   <- anscombe[, 'y3']

rmse     <- (mean((actual - predicted)^ 2)) ^ 0.5
rmse
## [1] 1.118286
mu       <- mean(actual)
rse      <- mean((predicted - actual) ^ 2) / mean((mu - actual)^ 2)
rsquare  <- 1 - rse
rsquare
## [1] 0.666324
model1 <- list(
  rmse    = rmse,
  rse     = rse,
  rsquare = rsquare
)
model1
## $rmse
## [1] 1.118286
## 
## $rse
## [1] 0.333676
## 
## $rsquare
## [1] 0.666324
## rlm
data("anscombe")
plot(y3 ~ x1, data = anscombe)
rlmfit <- rlm(y3 ~ x1, data = anscombe) 
abline(rlmfit, col="red")

predicted <- predict(rlmfit, newsdata = anscombe[,x1])
predicted
##        1        2        3        4        5        6        7        8 
## 7.460722 6.769270 8.497899 7.114996 7.806448 8.843625 6.077819 5.386367 
##        9       10       11 
## 8.152173 6.423545 5.732093
actual   <- anscombe[, 'y3']

rmse     <- (mean((actual - predicted)^ 2)) ^ 0.5
rmse
## [1] 1.279045
mu       <- mean(actual)
rse      <- mean((predicted - actual) ^ 2) / mean((mu - actual)^ 2)
rsquare  <- 1 - rse
rsquare
## [1] 0.5634933
model2 <- list(
  rmse    = rmse,
  rse     = rse,
  rsquare = rsquare
)
model2
## $rmse
## [1] 1.279045
## 
## $rse
## [1] 0.4365067
## 
## $rsquare
## [1] 0.5634933
print('model1')
## [1] "model1"
print(model1)
## $rmse
## [1] 1.118286
## 
## $rse
## [1] 0.333676
## 
## $rsquare
## [1] 0.666324
print('model2')
## [1] "model2"
print(model2)
## $rmse
## [1] 1.279045
## 
## $rse
## [1] 0.4365067
## 
## $rsquare
## [1] 0.5634933

去除離群值

## lm
data("anscombe")
anscombe.subset <- anscombe[anscombe$x1 != 13, ]
plot(y3 ~ x1, data = anscombe.subset)
lmfit <- lm(y3 ~ x1, data = anscombe.subset)
summary(lmfit)
## 
## Call:
## lm(formula = y3 ~ x1, data = anscombe.subset)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0041558 -0.0022240  0.0000649  0.0018182  0.0050649 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 4.0056494  0.0029242    1370   <2e-16 ***
## x1          0.3453896  0.0003206    1077   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003082 on 8 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.161e+06 on 1 and 8 DF,  p-value: < 2.2e-16
abline(lmfit, coll ="red")
## Warning in int_abline(a = a, b = b, h = h, v = v, untf = untf, ...): "coll"
## 不是一個繪圖參數

predicted <- predict(lmfit, newsdata = anscombe.subset[,x1])
predicted
##        1        2        4        5        6        7        8        9 
## 7.459545 6.768766 7.114156 7.804935 8.841104 6.077987 5.387208 8.150325 
##       10       11 
## 6.423377 5.732597
actual   <- anscombe.subset[, 'y3']

rmse     <- (mean((actual - predicted)^ 2)) ^ 0.5
rmse
## [1] 0.002756339
mu       <- mean(actual)
rse      <- mean((predicted - actual) ^ 2) / mean((mu - actual)^ 2)
rsquare  <- 1 - rse
rsquare
## [1] 0.9999931

Multiple Regression

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv')

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
#house_prices$Brick
house_prices$brick_d <- ifelse(house_prices$Brick == 'Yes', 1, 0)
house_prices$east    <- ifelse(house_prices$Neighborhood == 'East', 1, 0)
house_prices$north  <- ifelse(house_prices$Neighborhood == 'North', 1, 0)

house_prices$Brick        <- NULL
house_prices$Neighborhood <- NULL

head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers brick_d east north
## 1    1 114300 1790        2         2      2       0    1     0
## 2    2 114200 2030        4         2      3       0    1     0
## 3    3 114800 1740        3         2      1       0    1     0
## 4    4  94700 1980        3         2      3       0    1     0
## 5    5 119800 2130        3         3      3       0    1     0
## 6    6 114600 1780        3         2      2       0    0     1
set.seed(110)
idx <- sample.int(2, nrow(house_prices), replace = TRUE, prob = c(0.6,0.4))
training_data   <- house_prices[idx == 1, ]
validation_data <- house_prices[idx == 2, ]

lm.fit1 <- lm(Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east+ brick_d, data = training_data)

summary(lm.fit1)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29684.9  -7243.2    131.9   5442.4  26997.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16196.683  15352.916   1.055  0.29512    
## SqFt            50.525      8.671   5.827 1.64e-07 ***
## Bathrooms    11323.358   2791.959   4.056  0.00013 ***
## Bedrooms      5279.107   2207.022   2.392  0.01949 *  
## Offers       -8172.431   1424.182  -5.738 2.34e-07 ***
## north       -20575.043   4111.119  -5.005 4.10e-06 ***
## east        -23222.284   3507.448  -6.621 6.43e-09 ***
## brick_d      15089.593   2618.395   5.763 2.12e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10470 on 69 degrees of freedom
## Multiple R-squared:  0.8412, Adjusted R-squared:  0.8251 
## F-statistic: 52.21 on 7 and 69 DF,  p-value: < 2.2e-16
nrow(training_data)
## [1] 77

AIC

lm.fit1.step <- stepAIC(lm.fit1)
## Start:  AIC=1433.07
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east + 
##     brick_d
## 
##             Df  Sum of Sq        RSS    AIC
## <none>                    7.5689e+09 1433.1
## - Bedrooms   1  627616327 8.1966e+09 1437.2
## - Bathrooms  1 1804342905 9.3733e+09 1447.5
## - north      1 2747564839 1.0317e+10 1454.9
## - Offers     1 3612086579 1.1181e+10 1461.1
## - brick_d    1 3643102656 1.1212e+10 1461.3
## - SqFt       1 3724914639 1.1294e+10 1461.9
## - east       1 4808548343 1.2377e+10 1468.9
summary(lm.fit1.step)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29684.9  -7243.2    131.9   5442.4  26997.5 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  16196.683  15352.916   1.055  0.29512    
## SqFt            50.525      8.671   5.827 1.64e-07 ***
## Bathrooms    11323.358   2791.959   4.056  0.00013 ***
## Bedrooms      5279.107   2207.022   2.392  0.01949 *  
## Offers       -8172.431   1424.182  -5.738 2.34e-07 ***
## north       -20575.043   4111.119  -5.005 4.10e-06 ***
## east        -23222.284   3507.448  -6.621 6.43e-09 ***
## brick_d      15089.593   2618.395   5.763 2.12e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10470 on 69 degrees of freedom
## Multiple R-squared:  0.8412, Adjusted R-squared:  0.8251 
## F-statistic: 52.21 on 7 and 69 DF,  p-value: < 2.2e-16

Examine Residuals

training_data$predict.price <- predict(lm.fit1)
training_data$error         <- residuals(lm.fit1)
head(training_data)
##    Home  Price SqFt Bedrooms Bathrooms Offers brick_d east north
## 2     2 114200 2030        4         2      3       0    1     0
## 6     6 114600 1780        3         2      2       0    0     1
## 10   10 104000 1730        3         3      3       0    1     0
## 12   12 123000 1870        2         2      2       1    1     0
## 13   13 102600 1910        3         2      4       0    0     1
## 16   16 145800 1780        4         2      1       0    0     0
##    predict.price      error
## 2      114786.86  -586.8556
## 6      107696.07  6903.9343
## 10     105673.48 -1673.4802
## 12     119406.60  3593.4025
## 13      97919.51  4680.4921
## 16     141722.65  4077.3531
hist(training_data$error)

validation_data$predict.price <- predict(lm.fit1, 
                                         newdata = validation_data)
validation_data$error         <- validation_data$predict.price -  validation_data$Price

hist(validation_data$error)

## 驗證在591 上

library(httr)

dfall <- data.frame()
for(pagecnt in 0:24){
  res <- GET(paste0('https://rent.591.com.tw/home/search/rsList?is_new_list=1&type=1&kind=1&searchtype=1&region=1&section=5&firstRow=', pagecnt * 30 ,'&totalRows=711'))
  house <- content(res)
  print(pagecnt)

  for( item in house$data$data){
    if (!is.null(item$layout)){
    df <- data.frame(address = item$address, 
                     price   = item$price, 
                     area    = item$area,
                   layout    = item$layout,
                    floor    = item$floor,
                 allfloor    = item$allfloor
                  )
    dfall <- rbind(dfall, df)
    }
  }
}
head(dfall)
str(dfall)

591 資料處理

head(dfall)
dfall$price <- as.integer(sub(pattern=',', replacement = '',dfall$price))




dfall$bedroom <- gsub(pattern='(\\d+)房(\\d+)廳(\\d+)衛', '\\1', dfall$layout)
dfall$livingroom <- gsub(pattern='(\\d+)房(\\d+)廳(\\d+)衛', '\\2', dfall$layout)
dfall$bathroom   <- gsub(pattern='(\\d+)房(\\d+)廳(\\d+)衛', '\\3', dfall$layout)

dfall$bedroom    <- as.integer(dfall$bedroom)
dfall$bathroom   <- as.integer(dfall$bathroom)
dfall$livingroom <- as.integer(dfall$livingroom)

dfall <- dfall[dfall$floor < 50, ]
dfall <- dfall[dfall$bathroom != '開放式格局',]
dfall$layout <- NULL


write.csv(x =dfall, file = 'rent_591_regression.csv')

利用591 建立房價預估模型

rent <- read.csv('rent_591_regression.csv') 
head(rent)
##   X                    address  price area floor allfloor livingroom
## 1 1 新生南路二段正面大安森林.. 200000  151     2       15          2
## 2 2  復興南路一段219巷忠孝復..  48000   36     2        7          2
## 3 3       和平東路二段兩房一..  25009   25     2        4          1
## 4 4       光復南路光復南路仁..  22914   20     7       12          1
## 5 5         樂業街2房1廳基隆..  19906   20     2        4          1
## 6 7       信義路三段台北大安..  36000   14    12       14          2
##   bathroom bedroom
## 1        4       4
## 2        1       3
## 3        1       2
## 4        1       2
## 5        1       2
## 6        1       1
fit <- lm(price ~ area + floor + allfloor + livingroom + bathroom + bedroom, data = rent)

summary(fit)
## 
## Call:
## lm(formula = price ~ area + floor + allfloor + livingroom + bathroom + 
##     bedroom, data = rent)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69228 -10669  -1390   7854 107682 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   866.00    3169.59   0.273  0.78476    
## area         1476.74      53.13  27.795  < 2e-16 ***
## floor         293.55     225.26   1.303  0.19295    
## allfloor      221.61     198.33   1.117  0.26421    
## livingroom   6264.27    1896.61   3.303  0.00101 ** 
## bathroom      272.16    1800.47   0.151  0.87989    
## bedroom     -6038.56    1079.81  -5.592 3.21e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19160 on 702 degrees of freedom
## Multiple R-squared:  0.7398, Adjusted R-squared:  0.7376 
## F-statistic: 332.7 on 6 and 702 DF,  p-value: < 2.2e-16
# for stepAIC
library(MASS)
fit.step1 <- stepAIC(fit)
## Start:  AIC=13988.97
## price ~ area + floor + allfloor + livingroom + bathroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## - bathroom    1 8.3844e+06 2.5761e+11 13987
## - allfloor    1 4.5816e+08 2.5806e+11 13988
## - floor       1 6.2314e+08 2.5822e+11 13989
## <none>                     2.5760e+11 13989
## - livingroom  1 4.0030e+09 2.6160e+11 13998
## - bedroom     1 1.1476e+10 2.6907e+11 14018
## - area        1 2.8350e+11 5.4109e+11 14513
## 
## Step:  AIC=13986.99
## price ~ area + floor + allfloor + livingroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## - allfloor    1 4.5560e+08 2.5806e+11 13986
## - floor       1 6.3527e+08 2.5824e+11 13987
## <none>                     2.5761e+11 13987
## - livingroom  1 4.3908e+09 2.6200e+11 13997
## - bedroom     1 1.2710e+10 2.7032e+11 14019
## - area        1 3.8118e+11 6.3878e+11 14629
## 
## Step:  AIC=13986.24
## price ~ area + floor + livingroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## <none>                     2.5806e+11 13986
## - floor       1 2.3614e+09 2.6042e+11 13991
## - livingroom  1 4.1510e+09 2.6221e+11 13996
## - bedroom     1 1.2793e+10 2.7086e+11 14018
## - area        1 4.0328e+11 6.6135e+11 14652
# Withohout MASS
fit.step <- step(fit)
## Start:  AIC=13988.97
## price ~ area + floor + allfloor + livingroom + bathroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## - bathroom    1 8.3844e+06 2.5761e+11 13987
## - allfloor    1 4.5816e+08 2.5806e+11 13988
## - floor       1 6.2314e+08 2.5822e+11 13989
## <none>                     2.5760e+11 13989
## - livingroom  1 4.0030e+09 2.6160e+11 13998
## - bedroom     1 1.1476e+10 2.6907e+11 14018
## - area        1 2.8350e+11 5.4109e+11 14513
## 
## Step:  AIC=13986.99
## price ~ area + floor + allfloor + livingroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## - allfloor    1 4.5560e+08 2.5806e+11 13986
## - floor       1 6.3527e+08 2.5824e+11 13987
## <none>                     2.5761e+11 13987
## - livingroom  1 4.3908e+09 2.6200e+11 13997
## - bedroom     1 1.2710e+10 2.7032e+11 14019
## - area        1 3.8118e+11 6.3878e+11 14629
## 
## Step:  AIC=13986.24
## price ~ area + floor + livingroom + bedroom
## 
##              Df  Sum of Sq        RSS   AIC
## <none>                     2.5806e+11 13986
## - floor       1 2.3614e+09 2.6042e+11 13991
## - livingroom  1 4.1510e+09 2.6221e+11 13996
## - bedroom     1 1.2793e+10 2.7086e+11 14018
## - area        1 4.0328e+11 6.6135e+11 14652
summary(fit.step)
## 
## Call:
## lm(formula = price ~ area + floor + livingroom + bedroom, data = rent)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -69514 -10387  -1593   8442 107176 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2297.60    2900.99   0.792 0.428625    
## area         1491.20      44.96  33.169  < 2e-16 ***
## floor         449.74     177.20   2.538 0.011361 *  
## livingroom   6130.93    1821.90   3.365 0.000807 ***
## bedroom     -6002.33    1016.02  -5.908  5.4e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19150 on 704 degrees of freedom
## Multiple R-squared:  0.7394, Adjusted R-squared:  0.7379 
## F-statistic: 499.3 on 4 and 704 DF,  p-value: < 2.2e-16
predict(fit.step, data.frame(area = 20, floor= 3, livingroom = 1, bedroom= 2))
##        1 
## 27597.11

GLM

?glm
## starting httpd help server ... done
rent <- read.csv('rent_591_regression.csv') 

fit <- glm(price ~ area + floor + allfloor + livingroom + bathroom + bedroom, data = rent, family = 'gaussian')

summary(fit)
## 
## Call:
## glm(formula = price ~ area + floor + allfloor + livingroom + 
##     bathroom + bedroom, family = "gaussian", data = rent)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -69228  -10669   -1390    7854  107682  
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   866.00    3169.59   0.273  0.78476    
## area         1476.74      53.13  27.795  < 2e-16 ***
## floor         293.55     225.26   1.303  0.19295    
## allfloor      221.61     198.33   1.117  0.26421    
## livingroom   6264.27    1896.61   3.303  0.00101 ** 
## bathroom      272.16    1800.47   0.151  0.87989    
## bedroom     -6038.56    1079.81  -5.592 3.21e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 366949358)
## 
##     Null deviance: 9.9015e+11  on 708  degrees of freedom
## Residual deviance: 2.5760e+11  on 702  degrees of freedom
## AIC: 16003
## 
## Number of Fisher Scoring iterations: 2
fit.step <- step(fit)
## Start:  AIC=16003.02
## price ~ area + floor + allfloor + livingroom + bathroom + bedroom
## 
##              Df   Deviance   AIC
## - bathroom    1 2.5761e+11 16001
## - allfloor    1 2.5806e+11 16002
## - floor       1 2.5822e+11 16003
## <none>          2.5760e+11 16003
## - livingroom  1 2.6160e+11 16012
## - bedroom     1 2.6907e+11 16032
## - area        1 5.4109e+11 16527
## 
## Step:  AIC=16001.04
## price ~ area + floor + allfloor + livingroom + bedroom
## 
##              Df   Deviance   AIC
## - allfloor    1 2.5806e+11 16000
## - floor       1 2.5824e+11 16001
## <none>          2.5761e+11 16001
## - livingroom  1 2.6200e+11 16011
## - bedroom     1 2.7032e+11 16033
## - area        1 6.3878e+11 16643
## 
## Step:  AIC=16000.3
## price ~ area + floor + livingroom + bedroom
## 
##              Df   Deviance   AIC
## <none>          2.5806e+11 16000
## - floor       1 2.6042e+11 16005
## - livingroom  1 2.6221e+11 16010
## - bedroom     1 2.7086e+11 16033
## - area        1 6.6135e+11 16666

Logistic Regression

data(iris)

dataset <- iris[51:150, ]

dataset$Species <- factor(dataset$Species)

fit <- glm(Species ~., data = dataset, family = binomial(link = "logit"))

str(dataset)
## 'data.frame':    100 obs. of  5 variables:
##  $ Sepal.Length: num  7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
##  $ Sepal.Width : num  3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
##  $ Petal.Length: num  4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
##  $ Petal.Width : num  1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
##  $ Species     : Factor w/ 2 levels "versicolor","virginica": 1 1 1 1 1 1 1 1 1 1 ...
summary(fit)
## 
## Call:
## glm(formula = Species ~ ., family = binomial(link = "logit"), 
##     data = dataset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10
fit.step <- step(fit)
## Start:  AIC=21.9
## Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                Df Deviance    AIC
## - Sepal.Length  1   13.266 21.266
## <none>              11.899 21.899
## - Sepal.Width   1   15.492 23.492
## - Petal.Width   1   23.772 31.772
## - Petal.Length  1   25.902 33.902
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=21.27
## Species ~ Sepal.Width + Petal.Length + Petal.Width
## 
##                Df Deviance    AIC
## <none>              13.266 21.266
## - Sepal.Width   1   20.564 26.564
## - Petal.Length  1   27.399 33.399
## - Petal.Width   1   31.512 37.512