Regression

X <- c(17, 21, 35, 39, 50, 65)
Y <- c(132, 150, 160, 162, 149,170)
X_avg <- mean(X)
Y_avg <- mean(Y)

slope <- (sum(X * Y) - length(X) * X_avg * Y_avg) / (sum(X ^ 2) - length(X) * X_avg ^ 2)

intercept <- Y_avg - slope * X_avg


# Y = 0.5529606 X + 132.913
plot(X,Y, xlim = c(0,80), ylim = c(0,200))
abline(a = intercept, b=slope, col='red')

?abline
## starting httpd help server ... done
data(anscombe)

View(anscombe)

plot(y1 ~ x1, data = anscombe )

fit <- lm(y1~ x1, data = anscombe)

fit
## 
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001
plot(y1 ~ x1, data = anscombe, xlim=c(0,20),ylim=c(0,20) )
abline(fit, col = 'red')
predicted <- predict(fit,  data.frame(x1 = 16) )
abline(v = 16, col ='blue', lty = 2)
abline(h = predicted, col ='blue', lty = 2)

fit2 <- glm(y1~ x1, data = anscombe, family = 'gaussian')

fit2
## 
## Call:  glm(formula = y1 ~ x1, family = "gaussian", data = anscombe)
## 
## Coefficients:
## (Intercept)           x1  
##      3.0001       0.5001  
## 
## Degrees of Freedom: 10 Total (i.e. Null);  9 Residual
## Null Deviance:       41.27 
## Residual Deviance: 13.76     AIC: 39.68
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
points(y1 ~x1, data = anscombe, col= 'blue',pch= 19 )
fit <- lm(y2 ~ x1, data = anscombe)
abline(fit, col = 'green')

# y = ax + b               => 0 convex
# y = ax^2 + bx   + c      => 1 convex
# y = ax^3 + bx^2 + cx + d => 2 convex

# y = x^2
x <- seq(-3,3, 0.01)
#x
plot(x, x^2)

y = x^3 + 3 * x^2 + 10
plot(x, y)

plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
fit <- lm(y2 ~ poly(x1,2), data = anscombe )
fit
## 
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
## 
## Coefficients:
##  (Intercept)  poly(x1, 2)1  poly(x1, 2)2  
##        7.501         5.244        -3.712
# y = -3.712 x ^ 2 + 5.244 * x + 7.501

plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(y2 ~ x1, data = anscombe,col = 'red')

anscombe$x1
##  [1] 10  8 13  9 11 14  6  4 12  7  5
order(anscombe$x1)
##  [1]  8 11  7 10  2  4  1  5  9  3  6
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(y2 ~ x1, data =anscombe[order(anscombe$x1),],col = 'red')

predicted <- predict(fit, data.frame(x1=anscombe$x1))


predicted[order(anscombe$x1)]
##        8       11        7       10        2        4        1        5 
## 3.100210 4.740629 6.127622 7.261189 8.141329 8.768042 9.141329 9.261189 
##        9        3        6 
## 9.127622 8.740629 8.100210
anscombe$x1[order(anscombe$x1)]
##  [1]  4  5  6  7  8  9 10 11 12 13 14
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(anscombe$x1[order(anscombe$x1)],predicted[order(anscombe$x1)] ,col = 'red')

plot(y3 ~ x1, data = anscombe, col= 'red',pch= 19 )
fit <- lm(y3 ~ x1, data = anscombe)
abline(fit, col='blue')

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
plot(y3 ~ x1, data = anscombe, col= 'red',pch= 19 )
rlmfit <- rlm(y3 ~ x1, data = anscombe)
abline(rlmfit, col='blue')

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv')

names(house_prices)
## [1] "Home"         "Price"        "SqFt"         "Bedrooms"    
## [5] "Bathrooms"    "Offers"       "Brick"        "Neighborhood"
str(house_prices)
## 'data.frame':    128 obs. of  8 variables:
##  $ Home        : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Price       : int  114300 114200 114800 94700 119800 114600 151600 150700 119200 104000 ...
##  $ SqFt        : int  1790 2030 1740 1980 2130 1780 1830 2160 2110 1730 ...
##  $ Bedrooms    : int  2 4 3 3 3 3 3 4 4 3 ...
##  $ Bathrooms   : int  2 2 2 2 3 2 3 2 2 3 ...
##  $ Offers      : int  2 3 1 3 3 2 3 2 3 3 ...
##  $ Brick       : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
##  $ Neighborhood: Factor w/ 3 levels "East","North",..: 1 1 1 1 1 2 3 3 1 1 ...
plot(Price ~ SqFt, data = house_prices, xlim=c(0,3000), ylim = c(0,200000))

fit <- lm(Price ~ SqFt, data = house_prices)
abline(fit, col = 'red')

predict(fit, data.frame(SqFt= 3000))
##        1 
## 200587.8
predicted <- predict(fit, house_prices)
residual <- predicted - house_prices$Price

hist(residual)

plot(x = predicted, y = residual, ylim=c(-60000, 60000))
abline(h = 50000, col='red')
abline(h = -50000, col='red')

max(residual)
## [1] 46593.19
min(residual)
## [1] -54829.44
summary(fit)
## 
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -46593 -16644  -1610  15124  54829 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10091.130  18966.104  -0.532    0.596    
## SqFt            70.226      9.426   7.450  1.3e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22480 on 126 degrees of freedom
## Multiple R-squared:  0.3058, Adjusted R-squared:  0.3003 
## F-statistic:  55.5 on 1 and 126 DF,  p-value: 1.302e-11

Evaluate Regression Model

plot(Price ~ SqFt, data = house_prices)
lmfit <- lm(Price ~ SqFt, data = house_prices)
abline(lmfit, col="red")

predicted <- predict(lmfit, newdata=house_prices[c("SqFt")])

actual <- house_prices$Price
rmse   <- (mean((predicted - actual)^2))^0.5
rmse
## [1] 22299.25
mu     <- mean(actual)
# RSE = SSE / SST
rse    <- mean((predicted - actual)^2) / mean((mu - actual)^2)
rsquare <- 1 - rse
rsquare
## [1] 0.3057894

Multiple Regression

names(house_prices)
## [1] "Home"         "Price"        "SqFt"         "Bedrooms"    
## [5] "Bathrooms"    "Offers"       "Brick"        "Neighborhood"
head(house_prices)
##   Home  Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1    1 114300 1790        2         2      2    No         East
## 2    2 114200 2030        4         2      3    No         East
## 3    3 114800 1740        3         2      1    No         East
## 4    4  94700 1980        3         2      3    No         East
## 5    5 119800 2130        3         3      3    No         East
## 6    6 114600 1780        3         2      2    No        North
house_prices$brick_d <- ifelse(house_prices$Brick == 'Yes', 1, 0)

house_prices$Neighborhood
##   [1] East  East  East  East  East  North West  West  East  East  East 
##  [12] East  North North West  West  East  North East  West  East  North
##  [23] North North East  North West  East  North West  West  North East 
##  [34] East  North North North West  West  North East  West  East  East 
##  [45] West  East  North North East  North East  North North North North
##  [56] East  East  West  West  West  West  North West  East  West  North
##  [67] North East  North West  West  West  North East  West  North West 
##  [78] West  North West  East  West  West  East  North West  North West 
##  [89] North North West  East  West  East  West  West  East  East  West 
## [100] West  North East  East  West  East  West  North East  East  East 
## [111] North North East  North East  North West  North North North North
## [122] East  East  East  East  North West  North
## Levels: East North West
house_prices$east<- ifelse(house_prices$Neighborhood=="East",1,0)
house_prices$north<-
ifelse(house_prices$Neighborhood=="North",1,0)

house_prices$Neighborhood <- NULL

#house_prices
set.seed(123)

idx <- sample.int(2, nrow(house_prices), replace = TRUE, prob= c(0.6, 0.4))
training_data  <- house_prices[idx==1,]
validation_data <- house_prices[idx==2,]

lm.fit1 <- lm(Price ~ SqFt+Bathrooms+Bedrooms+Offers+
north+east+brick_d, data=training_data)

summary(lm.fit1)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25491.4  -5131.2   -490.5   6769.7  20369.0 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13252.14   12531.98   1.057   0.2939    
## SqFt            61.48       6.89   8.923 3.66e-13 ***
## Bathrooms     3266.08    2487.80   1.313   0.1935    
## Bedrooms      4823.22    2015.58   2.393   0.0194 *  
## Offers       -8331.03    1260.22  -6.611 6.35e-09 ***
## north       -17452.11    3911.56  -4.462 3.03e-05 ***
## east        -20680.68    3159.79  -6.545 8.35e-09 ***
## brick_d      19541.81    2412.24   8.101 1.19e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared:  0.8922, Adjusted R-squared:  0.8814 
## F-statistic: 82.74 on 7 and 70 DF,  p-value: < 2.2e-16
lm.fit2 <- lm(Price ~ SqFt, data=training_data)
summary(lm.fit2)
## 
## Call:
## lm(formula = Price ~ SqFt, data = training_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -45987 -14901   -737  14958  54772 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -15323.92   23496.09  -0.652    0.516    
## SqFt            72.82      11.85   6.146  3.4e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21850 on 76 degrees of freedom
## Multiple R-squared:  0.332,  Adjusted R-squared:  0.3232 
## F-statistic: 37.77 on 1 and 76 DF,  p-value: 3.397e-08
summary(lm.fit1)
## 
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + 
##     east + brick_d, data = training_data)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -25491.4  -5131.2   -490.5   6769.7  20369.0 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  13252.14   12531.98   1.057   0.2939    
## SqFt            61.48       6.89   8.923 3.66e-13 ***
## Bathrooms     3266.08    2487.80   1.313   0.1935    
## Bedrooms      4823.22    2015.58   2.393   0.0194 *  
## Offers       -8331.03    1260.22  -6.611 6.35e-09 ***
## north       -17452.11    3911.56  -4.462 3.03e-05 ***
## east        -20680.68    3159.79  -6.545 8.35e-09 ***
## brick_d      19541.81    2412.24   8.101 1.19e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared:  0.8922, Adjusted R-squared:  0.8814 
## F-statistic: 82.74 on 7 and 70 DF,  p-value: < 2.2e-16
lm.fit1.step <- step(lm.fit1)
## Start:  AIC=1430.44
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east + 
##     brick_d
## 
##             Df  Sum of Sq        RSS    AIC
## - Bathrooms  1  144167994 5.9994e+09 1430.3
## <none>                    5.8552e+09 1430.4
## - Bedrooms   1  478980711 6.3342e+09 1434.6
## - north      1 1665108832 7.5203e+09 1448.0
## - east       1 3583097368 9.4383e+09 1465.7
## - Offers     1 3655512174 9.5107e+09 1466.3
## - brick_d    1 5489527535 1.1345e+10 1480.0
## - SqFt       1 6659891685 1.2515e+10 1487.7
## 
## Step:  AIC=1430.34
## Price ~ SqFt + Bedrooms + Offers + north + east + brick_d
## 
##            Df  Sum of Sq        RSS    AIC
## <none>                   5.9994e+09 1430.3
## - Bedrooms  1  692383542 6.6918e+09 1436.9
## - north     1 1714508808 7.7139e+09 1448.0
## - Offers    1 3533703830 9.5331e+09 1464.5
## - east      1 3667946556 9.6673e+09 1465.5
## - brick_d   1 6085356765 1.2085e+10 1483.0
## - SqFt      1 7460095606 1.3459e+10 1491.4
lm.fit1.step
## 
## Call:
## lm(formula = Price ~ SqFt + Bedrooms + Offers + north + east + 
##     brick_d, data = training_data)
## 
## Coefficients:
## (Intercept)         SqFt     Bedrooms       Offers        north  
##    14481.33        63.47      5565.89     -8130.91    -17690.07  
##        east      brick_d  
##   -20895.93     20168.34
training_data$predict.price <- predict(lm.fit1)
#training_data
training_data$error <- residuals(lm.fit1)


validation_data$predict.price <- predict(lm.fit1,newdata=validation_data)

validation_data$error <- validation_data$predict.price - validation_data$Price

hist(training_data$error)

hist(validation_data$error)

a<-cor(training_data$Price,training_data$predict.price)
a^2
## [1] 0.8921676
b<-cor(validation_data$Price,validation_data$predict.price)
b^2
## [1] 0.8097617