Regression Model

x <- c(17,21,35,39,50,65)
y <- c(132, 150, 160, 162, 149, 170 )
plot(x, y, ylim = c(0,200), xlim = c(0,100))
x_avg <- mean(x)
y_avg <- mean(y)

B1 <- (sum(x*y ) - 6 * x_avg * y_avg) / (sum(x ^ 2) - 6 *x_avg ^ 2)
B1
## [1] 0.5529606
B0 <- y_avg - B1 * x_avg

B0
## [1] 132.913
B1
## [1] 0.5529606
plot(x, y, ylim = c(0,200), xlim = c(0,100))
fit <- lm(y ~ x)
fit
## 
## Call:
## lm(formula = y ~ x)
## 
## Coefficients:
## (Intercept)            x  
##     132.913        0.553
plot(x, y, ylim = c(0,200), xlim = c(0,100))
abline(fit, col = 'red')

Anscombe dataset

data("anscombe")
View(anscombe)

fit  <- lm(y1~ x1, data = anscombe)

plot(y1 ~ x1, data = anscombe)
abline(fit, col = "red")

plot(y2 ~ x1, data = anscombe)
fit  <- lm(y2~ x1, data = anscombe)
abline(fit, col="red")

fit2  <- lm(y2~ poly(x1, 2), data = anscombe)
fit2
## 
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
## 
## Coefficients:
##  (Intercept)  poly(x1, 2)1  poly(x1, 2)2  
##        7.501         5.244        -3.712
plot(y2 ~ x1, data = anscombe)
lines(sort(anscombe$x1), fit2$fitted.values[order(anscombe$x1)], col="red")

fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col="red")

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.3
fit <- rlm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col="red")

台北市房價分析

library(readr)
lvr_price <- read_csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/lvr_prices.csv')
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual expected   <int>       <chr>      <chr>      <chr> actual 1  1282 total_price an integer 6700000000 file 2  2243 total_price an integer 3882685600 row 3  2244 total_price an integer 3373314400 col 4  4629 total_price an integer 3050000000 expected 5  5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
head(lvr_price)
## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1     0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2     1 中正區 房地(土地+建物)     臺北市中正區忠孝東路二段121~150號
## 3     2 大同區            土地               橋北段二小段601~630地號
## 4     3 大同區 房地(土地+建物)       臺北市大同區重慶北路一段61~90號
## 5     4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6     5 信義區            土地               福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>
lvr_price <- lvr_price[(lvr_price$trading_target == '房地(土地+建物)') & (lvr_price$city_land_type == '住'), c('building_sqmeter', 'total_price')]
lvr_price <- na.omit(lvr_price)
lvr_price <- lvr_price[(lvr_price$building_sqmeter > 0) & (lvr_price$total_price > 0 ),]

plot(log(total_price) ~ log(building_sqmeter),  data = lvr_price)

fit <- lm(total_price ~ building_sqmeter, data = lvr_price)
fit 
## 
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
## 
## Coefficients:
##      (Intercept)  building_sqmeter  
##         -1138131            184557
184556 / 0.3025
## [1] 610102.5
plot(log(total_price) ~ log(building_sqmeter),  data = lvr_price)

summary(lvr_price$total_price)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 1.000e+00 8.800e+06 1.360e+07 1.701e+07 2.000e+07 2.009e+09
fit2 <- lm(log(total_price) ~ log(building_sqmeter), data = lvr_price)

plot(log(total_price) ~ log(building_sqmeter),  data = lvr_price)
abline(fit2, col="red")

fit
## 
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
## 
## Coefficients:
##      (Intercept)  building_sqmeter  
##         -1138131            184557
summary(fit)
## 
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -556591677   -4575681    -824534    3136806  600071076 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -1138130.9   100923.0  -11.28   <2e-16 ***
## building_sqmeter   184556.7      706.5  261.23   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13660000 on 34821 degrees of freedom
## Multiple R-squared:  0.6621, Adjusted R-squared:  0.6621 
## F-statistic: 6.824e+04 on 1 and 34821 DF,  p-value: < 2.2e-16

模型評估

fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col = 'red')

predicted <- predict(fit, newdata = anscombe[c('x1')])
actual    <- anscombe$y3

rmse1 <- (mean((predicted - actual)^2)) ^ 0.5
rmse1
## [1] 1.118286
mu <- mean(actual)
rse1 <- mean((predicted - actual)^2)/mean((mu - actual)^2)
rse1
## [1] 0.333676
rsquare1 <- 1 - rse1
rsquare1
## [1] 0.666324
rmse1
## [1] 1.118286
rse1
## [1] 0.333676
rsquare1
## [1] 0.666324
fit2 <- rlm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col = 'red')

predicted <- predict(fit2, newdata = anscombe[c('x1')])
actual    <- anscombe$y3

rmse2 <- (mean((predicted - actual)^2)) ^ 0.5
rmse2
## [1] 1.279045
mu <- mean(actual)
rse2 <- mean((predicted - actual)^2)/mean((mu - actual)^2)
rse2
## [1] 0.4365067
rsquare2 <- 1 - rse2
rsquare2
## [1] 0.5634933
rmse1
## [1] 1.118286
rmse2
## [1] 1.279045
rse1
## [1] 0.333676
rse2
## [1] 0.4365067
rsquare1
## [1] 0.666324
rsquare2
## [1] 0.5634933

 多元迴歸模型

house_price <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/house-prices.csv')
head(house_price)


house_price$brick_d    <- ifelse(house_price$Brick=="Yes",1,0)
house_price$east       <- ifelse(house_price$Neighborhood=="East",1,0)
house_price$north      <- ifelse(house_price$Neighborhood=="North",1,0)

head(house_price,30)

set.seed(110)
sub <- sample(nrow(house_prices), floor(nrow(house_prices) * 0.6))
training_data   <- house_price[sub,]
validation_data <- house_price[-sub,]

lm.fit1 <- lm(Price ~ SqFt+Bathrooms+Bedrooms+Offers+
                north+east+brick_d, data=training_data)
summary(lm.fit1)

lm.fit1.step <- step(lm.fit1)

summary(lm.fit1.step)


training_data$predicte.price <- predict(lm.fit1)
training_data$error          <- residuals(lm.fit1)

validation_data$predict.price <- predict(lm.fit1,newdata = validation_data)
validation_data$error         <- validation_data$predict.price - validation_data$Price

hist(training_data$error)
hist(validation_data$error)


a <- cor(training_data$Price, training_data$predicte.price)
b <- cor(validation_data$Price, validation_data$predict.price)

a * a
b * b

決策樹

data(iris)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species  ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
plot(fit, margin= 0.1)
text(fit)

plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species)
abline(v = 2.45, col = 'orange')
abline(h = 1.75, col = 'blue')

predicted <- predict(fit, iris[,1:4], type = 'class')
table(iris[,5], predicted)
##             predicted
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45

 邏輯式迴歸

iris.subset <- iris[iris$Species!= 'setosa', ]
str(iris.subset)
## 'data.frame':    100 obs. of  5 variables:
##  $ Sepal.Length: num  7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
##  $ Sepal.Width : num  3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
##  $ Petal.Length: num  4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
##  $ Petal.Width : num  1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
iris.subset$Species <- factor(iris.subset$Species)


fit <- glm(Species ~ . , data = iris.subset, family = 'binomial')
summary(fit)
## 
## Call:
## glm(formula = Species ~ ., family = "binomial", data = iris.subset)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.01105  -0.00541  -0.00001   0.00677   1.78065  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept)   -42.638     25.707  -1.659   0.0972 .
## Sepal.Length   -2.465      2.394  -1.030   0.3032  
## Sepal.Width    -6.681      4.480  -1.491   0.1359  
## Petal.Length    9.429      4.737   1.991   0.0465 *
## Petal.Width    18.286      9.743   1.877   0.0605 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 138.629  on 99  degrees of freedom
## Residual deviance:  11.899  on 95  degrees of freedom
## AIC: 21.899
## 
## Number of Fisher Scoring iterations: 10

SVM

library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
#?svm
data(iris)
#??e1071




fit1 <- rpart(Species~., data = iris)
predicted1 <- predict(fit1, type= 'class')
table(iris$Species, predicted1)
##             predicted1
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         49         1
##   virginica       0          5        45
fit2 <- svm(Species~., data = iris)
predicted2 <- predict(fit2, type = 'class')
table(iris$Species, predicted2)
##             predicted2
##              setosa versicolor virginica
##   setosa         50          0         0
##   versicolor      0         48         2
##   virginica       0          2        48