Regression
X <- c(17, 21, 35, 39, 50, 65)
Y <- c(132, 150, 160, 162, 149,170)
X_avg <- mean(X)
Y_avg <- mean(Y)
slope <- (sum(X * Y) - length(X) * X_avg * Y_avg) / (sum(X ^ 2) - length(X) * X_avg ^ 2)
intercept <- Y_avg - slope * X_avg
# Y = 0.5529606 X + 132.913
plot(X,Y, xlim = c(0,80), ylim = c(0,200))
abline(a = intercept, b=slope, col='red')

?abline
## starting httpd help server ... done
data(anscombe)
View(anscombe)
plot(y1 ~ x1, data = anscombe )

fit <- lm(y1~ x1, data = anscombe)
fit
##
## Call:
## lm(formula = y1 ~ x1, data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
plot(y1 ~ x1, data = anscombe, xlim=c(0,20),ylim=c(0,20) )
abline(fit, col = 'red')
predicted <- predict(fit, data.frame(x1 = 16) )
abline(v = 16, col ='blue', lty = 2)
abline(h = predicted, col ='blue', lty = 2)

fit2 <- glm(y1~ x1, data = anscombe, family = 'gaussian')
fit2
##
## Call: glm(formula = y1 ~ x1, family = "gaussian", data = anscombe)
##
## Coefficients:
## (Intercept) x1
## 3.0001 0.5001
##
## Degrees of Freedom: 10 Total (i.e. Null); 9 Residual
## Null Deviance: 41.27
## Residual Deviance: 13.76 AIC: 39.68
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
points(y1 ~x1, data = anscombe, col= 'blue',pch= 19 )
fit <- lm(y2 ~ x1, data = anscombe)
abline(fit, col = 'green')

# y = ax + b => 0 convex
# y = ax^2 + bx + c => 1 convex
# y = ax^3 + bx^2 + cx + d => 2 convex
# y = x^2
x <- seq(-3,3, 0.01)
#x
plot(x, x^2)

y = x^3 + 3 * x^2 + 10
plot(x, y)

plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
fit <- lm(y2 ~ poly(x1,2), data = anscombe )
fit
##
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
##
## Coefficients:
## (Intercept) poly(x1, 2)1 poly(x1, 2)2
## 7.501 5.244 -3.712
# y = -3.712 x ^ 2 + 5.244 * x + 7.501
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(y2 ~ x1, data = anscombe,col = 'red')

anscombe$x1
## [1] 10 8 13 9 11 14 6 4 12 7 5
order(anscombe$x1)
## [1] 8 11 7 10 2 4 1 5 9 3 6
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(y2 ~ x1, data =anscombe[order(anscombe$x1),],col = 'red')

predicted <- predict(fit, data.frame(x1=anscombe$x1))
predicted[order(anscombe$x1)]
## 8 11 7 10 2 4 1 5
## 3.100210 4.740629 6.127622 7.261189 8.141329 8.768042 9.141329 9.261189
## 9 3 6
## 9.127622 8.740629 8.100210
anscombe$x1[order(anscombe$x1)]
## [1] 4 5 6 7 8 9 10 11 12 13 14
plot(y2 ~ x1, data = anscombe, col= 'red',pch= 19 )
lines(anscombe$x1[order(anscombe$x1)],predicted[order(anscombe$x1)] ,col = 'red')

plot(y3 ~ x1, data = anscombe, col= 'red',pch= 19 )
fit <- lm(y3 ~ x1, data = anscombe)
abline(fit, col='blue')

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
plot(y3 ~ x1, data = anscombe, col= 'red',pch= 19 )
rlmfit <- rlm(y3 ~ x1, data = anscombe)
abline(rlmfit, col='blue')

house_prices <- read.csv('https://raw.githubusercontent.com/ywchiu/fubonr/master/data/house-prices.csv')
names(house_prices)
## [1] "Home" "Price" "SqFt" "Bedrooms"
## [5] "Bathrooms" "Offers" "Brick" "Neighborhood"
str(house_prices)
## 'data.frame': 128 obs. of 8 variables:
## $ Home : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Price : int 114300 114200 114800 94700 119800 114600 151600 150700 119200 104000 ...
## $ SqFt : int 1790 2030 1740 1980 2130 1780 1830 2160 2110 1730 ...
## $ Bedrooms : int 2 4 3 3 3 3 3 4 4 3 ...
## $ Bathrooms : int 2 2 2 2 3 2 3 2 2 3 ...
## $ Offers : int 2 3 1 3 3 2 3 2 3 3 ...
## $ Brick : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 2 1 1 1 ...
## $ Neighborhood: Factor w/ 3 levels "East","North",..: 1 1 1 1 1 2 3 3 1 1 ...
plot(Price ~ SqFt, data = house_prices, xlim=c(0,3000), ylim = c(0,200000))
fit <- lm(Price ~ SqFt, data = house_prices)
abline(fit, col = 'red')

predict(fit, data.frame(SqFt= 3000))
## 1
## 200587.8
predicted <- predict(fit, house_prices)
residual <- predicted - house_prices$Price
hist(residual)

plot(x = predicted, y = residual, ylim=c(-60000, 60000))
abline(h = 50000, col='red')
abline(h = -50000, col='red')

max(residual)
## [1] 46593.19
min(residual)
## [1] -54829.44
summary(fit)
##
## Call:
## lm(formula = Price ~ SqFt, data = house_prices)
##
## Residuals:
## Min 1Q Median 3Q Max
## -46593 -16644 -1610 15124 54829
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10091.130 18966.104 -0.532 0.596
## SqFt 70.226 9.426 7.450 1.3e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22480 on 126 degrees of freedom
## Multiple R-squared: 0.3058, Adjusted R-squared: 0.3003
## F-statistic: 55.5 on 1 and 126 DF, p-value: 1.302e-11
Multiple Regression
names(house_prices)
## [1] "Home" "Price" "SqFt" "Bedrooms"
## [5] "Bathrooms" "Offers" "Brick" "Neighborhood"
head(house_prices)
## Home Price SqFt Bedrooms Bathrooms Offers Brick Neighborhood
## 1 1 114300 1790 2 2 2 No East
## 2 2 114200 2030 4 2 3 No East
## 3 3 114800 1740 3 2 1 No East
## 4 4 94700 1980 3 2 3 No East
## 5 5 119800 2130 3 3 3 No East
## 6 6 114600 1780 3 2 2 No North
house_prices$brick_d <- ifelse(house_prices$Brick == 'Yes', 1, 0)
house_prices$Neighborhood
## [1] East East East East East North West West East East East
## [12] East North North West West East North East West East North
## [23] North North East North West East North West West North East
## [34] East North North North West West North East West East East
## [45] West East North North East North East North North North North
## [56] East East West West West West North West East West North
## [67] North East North West West West North East West North West
## [78] West North West East West West East North West North West
## [89] North North West East West East West West East East West
## [100] West North East East West East West North East East East
## [111] North North East North East North West North North North North
## [122] East East East East North West North
## Levels: East North West
house_prices$east<- ifelse(house_prices$Neighborhood=="East",1,0)
house_prices$north<-
ifelse(house_prices$Neighborhood=="North",1,0)
house_prices$Neighborhood <- NULL
#house_prices
set.seed(123)
idx <- sample.int(2, nrow(house_prices), replace = TRUE, prob= c(0.6, 0.4))
training_data <- house_prices[idx==1,]
validation_data <- house_prices[idx==2,]
lm.fit1 <- lm(Price ~ SqFt+Bathrooms+Bedrooms+Offers+
north+east+brick_d, data=training_data)
summary(lm.fit1)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north +
## east + brick_d, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25491.4 -5131.2 -490.5 6769.7 20369.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13252.14 12531.98 1.057 0.2939
## SqFt 61.48 6.89 8.923 3.66e-13 ***
## Bathrooms 3266.08 2487.80 1.313 0.1935
## Bedrooms 4823.22 2015.58 2.393 0.0194 *
## Offers -8331.03 1260.22 -6.611 6.35e-09 ***
## north -17452.11 3911.56 -4.462 3.03e-05 ***
## east -20680.68 3159.79 -6.545 8.35e-09 ***
## brick_d 19541.81 2412.24 8.101 1.19e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared: 0.8922, Adjusted R-squared: 0.8814
## F-statistic: 82.74 on 7 and 70 DF, p-value: < 2.2e-16
lm.fit2 <- lm(Price ~ SqFt, data=training_data)
summary(lm.fit2)
##
## Call:
## lm(formula = Price ~ SqFt, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -45987 -14901 -737 14958 54772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -15323.92 23496.09 -0.652 0.516
## SqFt 72.82 11.85 6.146 3.4e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21850 on 76 degrees of freedom
## Multiple R-squared: 0.332, Adjusted R-squared: 0.3232
## F-statistic: 37.77 on 1 and 76 DF, p-value: 3.397e-08
summary(lm.fit1)
##
## Call:
## lm(formula = Price ~ SqFt + Bathrooms + Bedrooms + Offers + north +
## east + brick_d, data = training_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -25491.4 -5131.2 -490.5 6769.7 20369.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13252.14 12531.98 1.057 0.2939
## SqFt 61.48 6.89 8.923 3.66e-13 ***
## Bathrooms 3266.08 2487.80 1.313 0.1935
## Bedrooms 4823.22 2015.58 2.393 0.0194 *
## Offers -8331.03 1260.22 -6.611 6.35e-09 ***
## north -17452.11 3911.56 -4.462 3.03e-05 ***
## east -20680.68 3159.79 -6.545 8.35e-09 ***
## brick_d 19541.81 2412.24 8.101 1.19e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 9146 on 70 degrees of freedom
## Multiple R-squared: 0.8922, Adjusted R-squared: 0.8814
## F-statistic: 82.74 on 7 and 70 DF, p-value: < 2.2e-16
lm.fit1.step <- step(lm.fit1)
## Start: AIC=1430.44
## Price ~ SqFt + Bathrooms + Bedrooms + Offers + north + east +
## brick_d
##
## Df Sum of Sq RSS AIC
## - Bathrooms 1 144167994 5.9994e+09 1430.3
## <none> 5.8552e+09 1430.4
## - Bedrooms 1 478980711 6.3342e+09 1434.6
## - north 1 1665108832 7.5203e+09 1448.0
## - east 1 3583097368 9.4383e+09 1465.7
## - Offers 1 3655512174 9.5107e+09 1466.3
## - brick_d 1 5489527535 1.1345e+10 1480.0
## - SqFt 1 6659891685 1.2515e+10 1487.7
##
## Step: AIC=1430.34
## Price ~ SqFt + Bedrooms + Offers + north + east + brick_d
##
## Df Sum of Sq RSS AIC
## <none> 5.9994e+09 1430.3
## - Bedrooms 1 692383542 6.6918e+09 1436.9
## - north 1 1714508808 7.7139e+09 1448.0
## - Offers 1 3533703830 9.5331e+09 1464.5
## - east 1 3667946556 9.6673e+09 1465.5
## - brick_d 1 6085356765 1.2085e+10 1483.0
## - SqFt 1 7460095606 1.3459e+10 1491.4
lm.fit1.step
##
## Call:
## lm(formula = Price ~ SqFt + Bedrooms + Offers + north + east +
## brick_d, data = training_data)
##
## Coefficients:
## (Intercept) SqFt Bedrooms Offers north
## 14481.33 63.47 5565.89 -8130.91 -17690.07
## east brick_d
## -20895.93 20168.34
training_data$predict.price <- predict(lm.fit1)
#training_data
training_data$error <- residuals(lm.fit1)
validation_data$predict.price <- predict(lm.fit1,newdata=validation_data)
validation_data$error <- validation_data$predict.price - validation_data$Price
hist(training_data$error)

hist(validation_data$error)

a<-cor(training_data$Price,training_data$predict.price)
a^2
## [1] 0.8921676
b<-cor(validation_data$Price,validation_data$predict.price)
b^2
## [1] 0.8097617