require(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)

1 単回帰

data("father.son")
head(father.son)
##    fheight  sheight
## 1 65.04851 59.77827
## 2 63.25094 63.21404
## 3 64.95532 63.34242
## 4 65.75250 62.79238
## 5 61.13723 64.28113
## 6 63.02254 64.24221
ggplot(father.son , aes(x = fheight , y = sheight)) + geom_point() +
   stat_smooth(method = "lm") + labs(x = "Fathers" , y = "sons")

heightsLM <- lm(sheight ~ fheight , data = father.son)
heightsLM
## 
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
## 
## Coefficients:
## (Intercept)      fheight  
##     33.8866       0.5141

結果の詳細を見るにはsummary()を使う。

summary(heightsLM)
## 
## Call:
## lm(formula = sheight ~ fheight, data = father.son)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8772 -1.5144 -0.0079  1.6285  8.9685 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 33.88660    1.83235   18.49   <2e-16 ***
## fheight      0.51409    0.02705   19.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.437 on 1076 degrees of freedom
## Multiple R-squared:  0.2513, Adjusted R-squared:  0.2506 
## F-statistic: 361.2 on 1 and 1076 DF,  p-value: < 2.2e-16

2 重回帰

housing <- read.table ("http://www.jaredlander.com/data/housing.csv",
   sep = ",", header = TRUE,
   stringsAsFactors = FALSE)
names(housing) <- c("Neighborhood" , "Class" , "Units" , "YearBuilt" ,
    "SqFt" , "Income" , "IncomeperSqFt" , "Expence" ,
    "ExpencePerSqFt" , "NetIncome" , "Value" ,
    "ValueperSqFt" , "Boro")
housing <- housing[housing$Units < 1000 ,] # 外れ値を除く
head(housing)
##   Neighborhood          Class Units YearBuilt   SqFt   Income
## 1    FINANCIAL R9-CONDOMINIUM    42      1920  36500  1332615
## 2    FINANCIAL R4-CONDOMINIUM    78      1985 126420  6633257
## 3    FINANCIAL RR-CONDOMINIUM   500        NA 554174 17310000
## 4    FINANCIAL R4-CONDOMINIUM   282      1930 249076 11776313
## 5      TRIBECA R4-CONDOMINIUM   239      1985 219495 10004582
## 6      TRIBECA R4-CONDOMINIUM   133      1986 139719  5127687
##   IncomeperSqFt Expence ExpencePerSqFt NetIncome    Value ValueperSqFt
## 1         36.51  342005           9.37    990610  7300000       200.00
## 2         52.47 1762295          13.94   4870962 30690000       242.76
## 3         31.24 3543000           6.39  13767000 90970000       164.15
## 4         47.28 2784670          11.18   8991643 67556006       271.23
## 5         45.58 2783197          12.68   7221385 54320996       247.48
## 6         36.70 1497788          10.72   3629899 26737996       191.37
##        Boro
## 1 Manhattan
## 2 Manhattan
## 3 Manhattan
## 4 Manhattan
## 5 Manhattan
## 6 Manhattan
house1 <- lm(ValueperSqFt ~ Units + SqFt + Boro , data = housing)
summary(house1)
## 
## Call:
## lm(formula = ValueperSqFt ~ Units + SqFt + Boro, data = housing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -168.458  -22.680    1.493   26.290  261.761 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.430e+01  5.342e+00   8.293  < 2e-16 ***
## Units             -1.532e-01  2.421e-02  -6.330 2.88e-10 ***
## SqFt               2.070e-04  2.129e-05   9.723  < 2e-16 ***
## BoroBrooklyn       3.258e+01  5.561e+00   5.858 5.28e-09 ***
## BoroManhattan      1.274e+02  5.459e+00  23.343  < 2e-16 ***
## BoroQueens         3.011e+01  5.711e+00   5.272 1.46e-07 ***
## BoroStaten Island -7.114e+00  1.001e+01  -0.711    0.477    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 43.2 on 2613 degrees of freedom
## Multiple R-squared:  0.6034, Adjusted R-squared:  0.6025 
## F-statistic: 662.6 on 6 and 2613 DF,  p-value: < 2.2e-16
require(coefplot)
## Loading required package: coefplot
coefplot(house1)

相互作用も入れられる。次の式は\(y=a(x_1 + x_2)^2 + b + \epsilon\)で回帰する。I()を使わないとフォーミュラ式は\(y=a x_1 + b x_2 + c x_1 * x_2 + d + \epsilon\)と解釈してしまう。

house9 <- lm(ValueperSqFt ~ I(Units + SqFt)^2 , data = housing)
coefplot(house9)

3 汎化誤差

noise <- 0.2
x.train <- seq(1, 2*pi, length.out = 10)
y.train <- sin(x.train) + rnorm(length(x.train), mean = 0, sd = noise)
x.test <- seq(1, 2*pi, length.out = 10)
y.test <- sin(x.train) + rnorm(length(x.train), mean = 0, sd = noise)
model.1 <- lm(formula = y.train ~ x.train)
df <- data.frame(x=x.train, y.train=y.train, y.test=y.test)
ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) + 
  geom_smooth(method='lm', aes(x=x.train, y=y.train), se=F)

ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) + 
  geom_smooth(method='lm', formula = 'y~I(x^2)+x',aes(x=x.train, y=y.train), se=F)

ggplot(df, aes(x=x.train)) + geom_point(aes(y=y.train, shape='train')) + geom_point(aes(y=y.test, shape='test')) + 
  geom_smooth(method='lm', formula='y~I(x^3) + I(x^2) + x', aes(x=x.train, y=y.train), se=F)

rmse.train <- c()
rmse.test <- c()
for (deg in 1:9) {
  model <- lm(formula = y.train~poly(x.train,deg))
  rmse <- sqrt( sum( ( y.train - predict(model) )^2 ) / length(x.train) )
  rmse.train <- c(rmse.train,rmse)
  rmse <- sqrt( sum( ( y.test - predict(model) )^2 ) / length(x.train) )
  rmse.test <- c(rmse.test,rmse)
}
df <- data.frame(degree =rep(1:9,2), rmseError=c(rmse.train, rmse.test), data=c(rep('train',9), rep('test',9)))
ggplot(df,  aes(x=degree, y=rmseError, group=data, color=data)) + geom_point()

汎化誤差が最小になるのは三次のモデルである。