Regression Model
x <- c(17,21,35,39,50,65)
y <- c(132, 150, 160, 162, 149, 170 )
plot(x, y, ylim = c(0,200), xlim = c(0,100))
x_avg <- mean(x)
y_avg <- mean(y)
B1 <- (sum(x*y ) - 6 * x_avg * y_avg) / (sum(x ^ 2) - 6 *x_avg ^ 2)
B1
## [1] 0.5529606
B0 <- y_avg - B1 * x_avg
B0
## [1] 132.913
B1
## [1] 0.5529606
plot(x, y, ylim = c(0,200), xlim = c(0,100))
fit <- lm(y ~ x)
fit
##
## Call:
## lm(formula = y ~ x)
##
## Coefficients:
## (Intercept) x
## 132.913 0.553
plot(x, y, ylim = c(0,200), xlim = c(0,100))
abline(fit, col = 'red')

Anscombe dataset
data("anscombe")
View(anscombe)
fit <- lm(y1~ x1, data = anscombe)
plot(y1 ~ x1, data = anscombe)
abline(fit, col = "red")

plot(y2 ~ x1, data = anscombe)
fit <- lm(y2~ x1, data = anscombe)
abline(fit, col="red")

fit2 <- lm(y2~ poly(x1, 2), data = anscombe)
fit2
##
## Call:
## lm(formula = y2 ~ poly(x1, 2), data = anscombe)
##
## Coefficients:
## (Intercept) poly(x1, 2)1 poly(x1, 2)2
## 7.501 5.244 -3.712
plot(y2 ~ x1, data = anscombe)
lines(sort(anscombe$x1), fit2$fitted.values[order(anscombe$x1)], col="red")

fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col="red")

library(MASS)
## Warning: package 'MASS' was built under R version 3.4.3
fit <- rlm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col="red")

台北市房價分析
library(readr)
lvr_price <- read_csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/lvr_prices.csv')
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_character(),
## X1 = col_integer(),
## land_sqmeter = col_double(),
## trading_ymd = col_date(format = ""),
## finish_ymd = col_date(format = ""),
## building_sqmeter = col_double(),
## room = col_integer(),
## living_room = col_integer(),
## bath = col_integer(),
## total_price = col_integer(),
## price_per_sqmeter = col_double(),
## parking_sqmeter = col_double(),
## parking_price = col_integer()
## )
## See spec(...) for full column specifications.
## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)
## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col row col expected actual expected <int> <chr> <chr> <chr> actual 1 1282 total_price an integer 6700000000 file 2 2243 total_price an integer 3882685600 row 3 2244 total_price an integer 3373314400 col 4 4629 total_price an integer 3050000000 expected 5 5890 total_price an integer 3133800000 actual # ... with 1 more variables: file <chr>
## ... ................. ... ......................................... ........ ......................................... ...... ......................................... .... ......................................... ... ......................................... ... ......................................... ........ ......................................... ...... .......................................
## See problems(...) for more details.
head(lvr_price)
## # A tibble: 6 x 29
## X1 area trading_target address
## <int> <chr> <chr> <chr>
## 1 0 大安區 房地(土地+建物) 臺北市大安區和平東路三段1巷72弄1~30號
## 2 1 中正區 房地(土地+建物) 臺北市中正區忠孝東路二段121~150號
## 3 2 大同區 土地 橋北段二小段601~630地號
## 4 3 大同區 房地(土地+建物) 臺北市大同區重慶北路一段61~90號
## 5 4 內湖區 房地(土地+建物) 臺北市內湖區民權東路六段90巷6弄1~30號
## 6 5 信義區 土地 福德段一小段661~690地號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## # non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## # trading_num <chr>, floor <chr>, total_floor <chr>,
## # building_type <chr>, main_purpose <chr>, built_with <chr>,
## # finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## # living_room <int>, bath <int>, compartment <chr>, management <chr>,
## # total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## # parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## # numbers <chr>
lvr_price <- lvr_price[(lvr_price$trading_target == '房地(土地+建物)') & (lvr_price$city_land_type == '住'), c('building_sqmeter', 'total_price')]
lvr_price <- na.omit(lvr_price)
lvr_price <- lvr_price[(lvr_price$building_sqmeter > 0) & (lvr_price$total_price > 0 ),]
plot(log(total_price) ~ log(building_sqmeter), data = lvr_price)
fit <- lm(total_price ~ building_sqmeter, data = lvr_price)
fit
##
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
##
## Coefficients:
## (Intercept) building_sqmeter
## -1138131 184557
184556 / 0.3025
## [1] 610102.5
plot(log(total_price) ~ log(building_sqmeter), data = lvr_price)
summary(lvr_price$total_price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000e+00 8.800e+06 1.360e+07 1.701e+07 2.000e+07 2.009e+09
fit2 <- lm(log(total_price) ~ log(building_sqmeter), data = lvr_price)
plot(log(total_price) ~ log(building_sqmeter), data = lvr_price)
abline(fit2, col="red")

fit
##
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
##
## Coefficients:
## (Intercept) building_sqmeter
## -1138131 184557
summary(fit)
##
## Call:
## lm(formula = total_price ~ building_sqmeter, data = lvr_price)
##
## Residuals:
## Min 1Q Median 3Q Max
## -556591677 -4575681 -824534 3136806 600071076
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1138130.9 100923.0 -11.28 <2e-16 ***
## building_sqmeter 184556.7 706.5 261.23 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13660000 on 34821 degrees of freedom
## Multiple R-squared: 0.6621, Adjusted R-squared: 0.6621
## F-statistic: 6.824e+04 on 1 and 34821 DF, p-value: < 2.2e-16
模型評估
fit <- lm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col = 'red')

predicted <- predict(fit, newdata = anscombe[c('x1')])
actual <- anscombe$y3
rmse1 <- (mean((predicted - actual)^2)) ^ 0.5
rmse1
## [1] 1.118286
mu <- mean(actual)
rse1 <- mean((predicted - actual)^2)/mean((mu - actual)^2)
rse1
## [1] 0.333676
rsquare1 <- 1 - rse1
rsquare1
## [1] 0.666324
rmse1
## [1] 1.118286
rse1
## [1] 0.333676
rsquare1
## [1] 0.666324
fit2 <- rlm(y3 ~ x1, data = anscombe)
plot(y3 ~ x1, data = anscombe)
abline(fit, col = 'red')

predicted <- predict(fit2, newdata = anscombe[c('x1')])
actual <- anscombe$y3
rmse2 <- (mean((predicted - actual)^2)) ^ 0.5
rmse2
## [1] 1.279045
mu <- mean(actual)
rse2 <- mean((predicted - actual)^2)/mean((mu - actual)^2)
rse2
## [1] 0.4365067
rsquare2 <- 1 - rse2
rsquare2
## [1] 0.5634933
rmse1
## [1] 1.118286
rmse2
## [1] 1.279045
rse1
## [1] 0.333676
rse2
## [1] 0.4365067
rsquare1
## [1] 0.666324
rsquare2
## [1] 0.5634933
多元迴歸模型
house_price <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/house-prices.csv')
head(house_price)
house_price$brick_d <- ifelse(house_price$Brick=="Yes",1,0)
house_price$east <- ifelse(house_price$Neighborhood=="East",1,0)
house_price$north <- ifelse(house_price$Neighborhood=="North",1,0)
head(house_price,30)
set.seed(110)
sub <- sample(nrow(house_prices), floor(nrow(house_prices) * 0.6))
training_data <- house_price[sub,]
validation_data <- house_price[-sub,]
lm.fit1 <- lm(Price ~ SqFt+Bathrooms+Bedrooms+Offers+
north+east+brick_d, data=training_data)
summary(lm.fit1)
lm.fit1.step <- step(lm.fit1)
summary(lm.fit1.step)
training_data$predicte.price <- predict(lm.fit1)
training_data$error <- residuals(lm.fit1)
validation_data$predict.price <- predict(lm.fit1,newdata = validation_data)
validation_data$error <- validation_data$predict.price - validation_data$Price
hist(training_data$error)
hist(validation_data$error)
a <- cor(training_data$Price, training_data$predicte.price)
b <- cor(validation_data$Price, validation_data$predict.price)
a * a
b * b
決策樹
data(iris)
library(rpart)
## Warning: package 'rpart' was built under R version 3.4.2
fit <- rpart(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = iris)
plot(fit, margin= 0.1)
text(fit)

plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species)
abline(v = 2.45, col = 'orange')
abline(h = 1.75, col = 'blue')

predicted <- predict(fit, iris[,1:4], type = 'class')
table(iris[,5], predicted)
## predicted
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
邏輯式迴歸
iris.subset <- iris[iris$Species!= 'setosa', ]
str(iris.subset)
## 'data.frame': 100 obs. of 5 variables:
## $ Sepal.Length: num 7 6.4 6.9 5.5 6.5 5.7 6.3 4.9 6.6 5.2 ...
## $ Sepal.Width : num 3.2 3.2 3.1 2.3 2.8 2.8 3.3 2.4 2.9 2.7 ...
## $ Petal.Length: num 4.7 4.5 4.9 4 4.6 4.5 4.7 3.3 4.6 3.9 ...
## $ Petal.Width : num 1.4 1.5 1.5 1.3 1.5 1.3 1.6 1 1.3 1.4 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 2 2 2 2 2 2 2 2 2 2 ...
iris.subset$Species <- factor(iris.subset$Species)
fit <- glm(Species ~ . , data = iris.subset, family = 'binomial')
summary(fit)
##
## Call:
## glm(formula = Species ~ ., family = "binomial", data = iris.subset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.01105 -0.00541 -0.00001 0.00677 1.78065
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -42.638 25.707 -1.659 0.0972 .
## Sepal.Length -2.465 2.394 -1.030 0.3032
## Sepal.Width -6.681 4.480 -1.491 0.1359
## Petal.Length 9.429 4.737 1.991 0.0465 *
## Petal.Width 18.286 9.743 1.877 0.0605 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 138.629 on 99 degrees of freedom
## Residual deviance: 11.899 on 95 degrees of freedom
## AIC: 21.899
##
## Number of Fisher Scoring iterations: 10
SVM
library(e1071)
## Warning: package 'e1071' was built under R version 3.4.3
#?svm
data(iris)
#??e1071
fit1 <- rpart(Species~., data = iris)
predicted1 <- predict(fit1, type= 'class')
table(iris$Species, predicted1)
## predicted1
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 49 1
## virginica 0 5 45
fit2 <- svm(Species~., data = iris)
predicted2 <- predict(fit2, type = 'class')
table(iris$Species, predicted2)
## predicted2
## setosa versicolor virginica
## setosa 50 0 0
## versicolor 0 48 2
## virginica 0 2 48