Abstract
みんなのR、16章のお勉強。require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
data("father.son")
head(father.son)
## fheight sheight
## 1 65.04851 59.77827
## 2 63.25094 63.21404
## 3 64.95532 63.34242
## 4 65.75250 62.79238
## 5 61.13723 64.28113
## 6 63.02254 64.24221
ggplot(father.son , aes(x = fheight , y = sheight)) + geom_point() +
stat_smooth(method = "lm") + labs(x = "Fathers" , y = "sons")
heightsLM <- lm(sheight ~ fheight , data = father.son)
heightsLM
##
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
##
## Coefficients:
## (Intercept) fheight
## 33.8866 0.5141
結果の詳細を見るにはsummary()を使う。
summary(heightsLM)
##
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
##
## Residuals:
## Min 1Q Median 3Q Max
## -8.8772 -1.5144 -0.0079 1.6285 8.9685
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 33.88660 1.83235 18.49 <2e-16 ***
## fheight 0.51409 0.02705 19.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.437 on 1076 degrees of freedom
## Multiple R-squared: 0.2513, Adjusted R-squared: 0.2506
## F-statistic: 361.2 on 1 and 1076 DF, p-value: < 2.2e-16
housing <- read.table ("http://www.jaredlander.com/data/housing.csv",
sep = ",", header = TRUE,
stringsAsFactors = FALSE)
names(housing) <- c("Neighborhood" , "Class" , "Units" , "YearBuilt" ,
"SqFt" , "Income" , "IncomeperSqFt" , "Expence" ,
"ExpencePerSqFt" , "NetIncome" , "Value" ,
"ValueperSqFt" , "Boro")
housing <- housing[housing$Units < 1000 ,] # 外れ値を除く
head(housing)
## Neighborhood Class Units YearBuilt SqFt Income
## 1 FINANCIAL R9-CONDOMINIUM 42 1920 36500 1332615
## 2 FINANCIAL R4-CONDOMINIUM 78 1985 126420 6633257
## 3 FINANCIAL RR-CONDOMINIUM 500 NA 554174 17310000
## 4 FINANCIAL R4-CONDOMINIUM 282 1930 249076 11776313
## 5 TRIBECA R4-CONDOMINIUM 239 1985 219495 10004582
## 6 TRIBECA R4-CONDOMINIUM 133 1986 139719 5127687
## IncomeperSqFt Expence ExpencePerSqFt NetIncome Value ValueperSqFt
## 1 36.51 342005 9.37 990610 7300000 200.00
## 2 52.47 1762295 13.94 4870962 30690000 242.76
## 3 31.24 3543000 6.39 13767000 90970000 164.15
## 4 47.28 2784670 11.18 8991643 67556006 271.23
## 5 45.58 2783197 12.68 7221385 54320996 247.48
## 6 36.70 1497788 10.72 3629899 26737996 191.37
## Boro
## 1 Manhattan
## 2 Manhattan
## 3 Manhattan
## 4 Manhattan
## 5 Manhattan
## 6 Manhattan
house1 <- lm(ValueperSqFt ~ Units + SqFt + Boro , data = housing)
summary(house1)
##
## Call:
## lm(formula = ValueperSqFt ~ Units + SqFt + Boro, data = housing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -168.458 -22.680 1.493 26.290 261.761
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.430e+01 5.342e+00 8.293 < 2e-16 ***
## Units -1.532e-01 2.421e-02 -6.330 2.88e-10 ***
## SqFt 2.070e-04 2.129e-05 9.723 < 2e-16 ***
## BoroBrooklyn 3.258e+01 5.561e+00 5.858 5.28e-09 ***
## BoroManhattan 1.274e+02 5.459e+00 23.343 < 2e-16 ***
## BoroQueens 3.011e+01 5.711e+00 5.272 1.46e-07 ***
## BoroStaten Island -7.114e+00 1.001e+01 -0.711 0.477
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 43.2 on 2613 degrees of freedom
## Multiple R-squared: 0.6034, Adjusted R-squared: 0.6025
## F-statistic: 662.6 on 6 and 2613 DF, p-value: < 2.2e-16
require(coefplot)
## Loading required package: coefplot
coefplot(house1)
相互作用も入れられる。次の式は\(y=a(x_1 + x_2)^2 + b + \epsilon\)で回帰する。
I()を使わないとフォーミュラ式は\(y=a x_1 + b x_2 + c x_1 * x_2 + d + \epsilon\)と解釈してしまう。
house9 <- lm(ValueperSqFt ~ I(Units + SqFt)^2 , data = housing)
coefplot(house9)
noise <- 0.2
x.train <- seq(1, 2*pi, length.out = 10)
y.train <- sin(x.train) + rnorm(length(x.train), mean = 0, sd = noise)
x.test <- seq(1, 2*pi, length.out = 10)
y.test <- sin(x.train) + rnorm(length(x.train), mean = 0, sd = noise)
model.1 <- lm(formula = y.train ~ x.train)
df <- data.frame(x=x.train, y.train=y.train, y.test=y.test)
ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) +
geom_smooth(method='lm', aes(x=x.train, y=y.train), se=F)
ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) +
geom_smooth(method='lm', formula = 'y~I(x^2)+x',aes(x=x.train, y=y.train), se=F)
ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) +
geom_smooth(method='lm', formula='y~I(x^3) + I(x^2) + x', aes(x=x.train, y=y.train), se=F)
rmse.train <- c()
rmse.test <- c()
for (deg in 1:9) {
model <- lm(formula = y.train~poly(x.train,deg))
rmse <- sqrt( sum( ( y.train - predict(model) )^2 ) / length(x.train) )
rmse.train <- c(rmse.train,rmse)
rmse <- sqrt( sum( ( y.test - predict(model) )^2 ) / length(x.train) )
rmse.test <- c(rmse.test,rmse)
}
df <- data.frame(degree =rep(1:9,2), rmseError=c(rmse.train, rmse.test), data=c(rep('train',9), rep('test',9)))
ggplot(df, aes(x=degree, y=rmseError, group=data, color=data)) + geom_point()
汎化誤差が最小になるのは三次のモデルである。