Abstract
みんなのR 20章の勉強ノートrequire(UsingR)
require(ggplot2)
require(tidyr)
require(dplyr)
require(plotly)
require(coefplot)
load("../data/wifi.rdata")
head(wifi)
## Distance x y
## 1 21.87559 28.60461 68.429628
## 2 67.68198 90.29680 29.155945
## 3 79.25427 83.48934 0.371902
## 4 44.73767 61.39133 80.258138
## 5 39.71233 19.55080 83.805855
## 6 56.65595 71.93928 65.551340
ggplot(wifi, aes(x=x, y=y, color=Distance)) + geom_point() +
scale_color_gradient2(low="blue", mid="white", high="red",
midpoint=mean(wifi$Distance))
wifiMod1 <- nls(Distance ~ sqrt((betaX - x)^2 + (betaY - y)^2),
data = wifi, start = list(betaX = 50, betaY = 50))
summary(wifiMod1)
##
## Formula: Distance ~ sqrt((betaX - x)^2 + (betaY - y)^2)
##
## Parameters:
## Estimate Std. Error t value Pr(>|t|)
## betaX 17.851 1.289 13.85 <2e-16 ***
## betaY 52.906 1.476 35.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.73 on 198 degrees of freedom
##
## Number of iterations to convergence: 6
## Achieved convergence tolerance: 3.846e-06
ggplot(wifi, aes(x = x, y = y, color = Distance)) + geom_point() +
scale_color_gradient2(low = "blue", mid = "white", high = "red",
midpoint = mean(wifi$Distance)) +
geom_point(data = as.data.frame(t(coef(wifiMod1))),
aes(x = betaX, y = betaY), size = 5, color = "green")
青い点が密集しているところにあるのでよい推定であることが分かる.
data("diamond")
ggplot(diamond, aes(x=carat, y=price)) + geom_point()
diaSpline1 <- smooth.spline(x=diamond$carat, y=diamond$price )
df <- data.frame(x=diaSpline1$x, y=diaSpline1$y)
ggplot(diamond, aes(x=carat, y=price))+ geom_point() +
geom_line(data=df, aes(x=x,y=y)) + scale_x_continuous(limits = c(0.1, 0.36)) +
scale_y_continuous(limits = c(0, 1100))
creditNames <- c("Checking", "Duration", "CreditHistory",
"Purpose", "CreditAmount", "Savings", "Employment",
"InstallmentRate", "GenderMarital", "OtherDebtors",
"YearsAtResidence", "RealEstate", "Age",
"OtherInstallment", "Housing", "ExistingCredits", "Job",
"NumLiable", "Phone", "Foreign", "Credit")
theURL <- "http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data"
credit <- read.table(theURL, sep = " ",header=FALSE,col.names=creditNames,stringsAsFactors = FALSE)
require(rpart)
## Loading required package: rpart
require(rpart.plot)
## Loading required package: rpart.plot
creditTree <- rpart(Credit ~ CreditAmount + Age +
+ CreditHistory + Employment, data = credit)
head(credit)
## Checking Duration CreditHistory Purpose CreditAmount Savings Employment
## 1 A11 6 A34 A43 1169 A65 A75
## 2 A12 48 A32 A43 5951 A61 A73
## 3 A14 12 A34 A46 2096 A61 A74
## 4 A11 42 A32 A42 7882 A61 A74
## 5 A11 24 A33 A40 4870 A61 A73
## 6 A14 36 A32 A46 9055 A65 A73
## InstallmentRate GenderMarital OtherDebtors YearsAtResidence RealEstate
## 1 4 A93 A101 4 A121
## 2 2 A92 A101 2 A121
## 3 2 A93 A101 3 A121
## 4 2 A93 A103 4 A122
## 5 3 A93 A101 4 A124
## 6 2 A93 A101 4 A124
## Age OtherInstallment Housing ExistingCredits Job NumLiable Phone
## 1 67 A143 A152 2 A173 1 A192
## 2 22 A143 A152 1 A173 1 A191
## 3 49 A143 A152 1 A172 2 A191
## 4 45 A143 A153 1 A173 2 A191
## 5 53 A143 A153 2 A173 2 A191
## 6 35 A143 A153 1 A172 2 A192
## Foreign Credit
## 1 A201 1
## 2 A201 2
## 3 A201 1
## 4 A201 1
## 5 A201 2
## 6 A201 1
rpart.plot(creditTree)
require(useful)
## Loading required package: useful
require(randomForest)
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:Hmisc':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
# 予測因子と応答変数の matrix を作成する
creditFormula <- Credit ~ CreditHistory + Purpose + Employment +
Duration + Age + CreditAmount
creditX <- build.x(creditFormula, data=credit)
creditY <- build.y(creditFormula, data=credit)
# ランダムフォレストの適用
creditForest <- randomForest(x=creditX, y=creditY)
## Warning in randomForest.default(x = creditX, y = creditY): The response has
## five or fewer unique values. Are you sure you want to do regression?
creditForest
##
## Call:
## randomForest(x = creditX, y = creditY)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 7
##
## Mean of squared residuals: 0.1850619
## % Var explained: 11.88