Rの導入
read file
library(ggplot2)
# tsv
body.data <- read.table("body_sample.tsv", header = T, stringsAsFactors = F)
head(body.data)
## id gender height weight
## 1 1 M 157.67 64.82
## 2 2 M 178.76 72.38
## 3 3 M 161.95 64.52
## 4 4 M 162.26 63.35
## 5 5 M 167.95 68.76
## 6 6 M 165.59 66.40
#csv
body.data <- read.csv("body_sample.csv", header = T, stringsAsFactors = F)
head(body.data)
## id gender height weight
## 1 1 M 157.67 64.82
## 2 2 M 178.76 72.38
## 3 3 M 161.95 64.52
## 4 4 M 162.26 63.35
## 5 5 M 167.95 68.76
## 6 6 M 165.59 66.40
summary
summary(body.data)
## id gender height weight
## Min. : 1.0 Length:400 Min. :135.5 Min. :31.44
## 1st Qu.:100.8 Class :character 1st Qu.:152.4 1st Qu.:50.93
## Median :200.5 Mode :character Median :158.2 Median :57.78
## Mean :200.5 Mean :158.4 Mean :58.16
## 3rd Qu.:300.2 3rd Qu.:163.9 3rd Qu.:65.53
## Max. :400.0 Max. :181.5 Max. :78.99
標準偏差と不偏分散
sd(body.data$height)
## [1] 8.210723
var(body.data$weight)
## [1] 84.0654
histogram
ggplot(body.data, aes(x=height)) +
geom_histogram() +
theme_bw(16) +
ylab("count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

histogram by male and female
ggplot(body.data, aes(x=height, fill=gender)) +
geom_histogram() +
theme_bw(16) +
ylab("count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

boxplot
ggplot(body.data, aes(x=gender, y=height, fill=gender)) +
geom_boxplot() +
theme_bw(16)

point plot with height and weight
ggplot(body.data, aes(x=height, y=weight)) +
geom_point() +
theme_bw(16)

point plot and 回帰直線 with height and weight
ggplot(body.data, aes(x=height, y=weight)) +
geom_point() +
theme_bw(16) +
geom_smooth(method = "lm")

男女別point plot and 回帰直線 with height and weight
ggplot(body.data, aes(x=height, y=weight, col=gender)) +
geom_point() +
theme_bw(16) +
geom_smooth(method = "lm")

相関関係
# total
cor(body.data$height, body.data$weight)
## [1] 0.8928748
# male
body.data.m <- body.data[body.data$gender == "M",]
cor(body.data.m$height, body.data.m$weight)
## [1] 0.863457
# female
body.data.f <- body.data[body.data$gender == "F",]
cor(body.data.f$height, body.data.f$weight)
## [1] 0.9173599
仮想の投資と売上データの散布図
amount1.data <- read.csv("amount1.csv")
head(amount1.data)
## amount invest
## 1 296.6501 132.23378
## 2 268.5920 107.81794
## 3 304.8194 90.57184
## 4 263.6498 77.51110
## 5 246.7538 74.13405
## 6 309.6545 161.44121
summary(amount1.data)
## amount invest
## Min. :220.2 Min. : 37.02
## 1st Qu.:279.4 1st Qu.:103.92
## Median :302.0 Median :145.05
## Mean :298.1 Mean :150.01
## 3rd Qu.:320.8 3rd Qu.:196.05
## Max. :354.5 Max. :258.54
ggplot(amount1.data, aes(x=invest, y=amount)) +
geom_point() +
theme_bw(16)

線形回帰モデルの構築
amount1.lm1 <- lm(amount~invest, data=amount1.data)
summary(amount1.lm1)
##
## Call:
## lm(formula = amount ~ invest, data = amount1.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -31.521 -8.274 0.322 8.700 36.961
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 229.81293 2.56174 89.71 <2e-16 ***
## invest 0.45554 0.01593 28.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13.07 on 198 degrees of freedom
## Multiple R-squared: 0.8051, Adjusted R-squared: 0.8042
## F-statistic: 818.1 on 1 and 198 DF, p-value: < 2.2e-16
逓減型回帰モデル
ggplot(amount1.data, aes(x=invest, y=amount)) +
geom_point() +
theme_bw(16) +
geom_smooth(method = "lm", formula = y ~ log(x))

残差の分布
plot(amount1.lm1, which=1)

逓減型回帰モデル構築
amount1.lm2 <- lm(amount~log(invest), data=amount1.data)
summary(amount1.lm2)
##
## Call:
## lm(formula = amount ~ log(invest), data = amount1.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -35.328 -7.181 -0.875 7.540 32.365
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.005 9.436 -0.53 0.596
## log(invest) 61.574 1.909 32.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.84 on 198 degrees of freedom
## Multiple R-squared: 0.8401, Adjusted R-squared: 0.8393
## F-statistic: 1040 on 1 and 198 DF, p-value: < 2.2e-16
逓減型回帰モデルでの残差
plot(amount1.lm2, which=1)

ロジスティック回帰モデル
z <- data.frame(Titanic)
titanic.data <- data.frame(
Class=rep(z$Class, z$Freq),
Sex=rep(z$Sex, z$Freq),
Age=rep(z$Age, z$Freq),
Survived=rep(z$Survived, z$Freq)
)
titanic.logit <- glm(Survived~., data=titanic.data, family = binomial)
summary(titanic.logit)
##
## Call:
## glm(formula = Survived ~ ., family = binomial, data = titanic.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0812 -0.7149 -0.6656 0.6858 2.1278
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6853 0.2730 2.510 0.0121 *
## Class2nd -1.0181 0.1960 -5.194 2.05e-07 ***
## Class3rd -1.7778 0.1716 -10.362 < 2e-16 ***
## ClassCrew -0.8577 0.1573 -5.451 5.00e-08 ***
## SexFemale 2.4201 0.1404 17.236 < 2e-16 ***
## AgeAdult -1.0615 0.2440 -4.350 1.36e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2769.5 on 2200 degrees of freedom
## Residual deviance: 2210.1 on 2195 degrees of freedom
## AIC: 2222.1
##
## Number of Fisher Scoring iterations: 4
epicalcパッケージの読み込みとオッズの算出
#install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.3
#install_github("cran/epicalc")
library(epicalc)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epicalc'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(titanic.logit, simplified = T)
##
## OR lower95ci upper95ci Pr(>|Z|)
## Class2nd 0.3612825 0.2460447 0.5304933 2.053331e-07
## Class3rd 0.1690159 0.1207510 0.2365727 3.691891e-25
## ClassCrew 0.4241466 0.3115940 0.5773549 5.004592e-08
## SexFemale 11.2465380 8.5408719 14.8093331 1.431830e-66
## AgeAdult 0.3459219 0.2144193 0.5580746 1.360490e-05
決定木モデル
#install.packages("partykit")
library(rpart)
library(partykit)
## Warning: package 'partykit' was built under R version 3.4.3
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 3.4.3
## Loading required package: mvtnorm
titanic.rp <- rpart(Survived~., data = titanic.data)
plot(as.party(titanic.rp), tp_args = T)

主成分分析
state.pca <- prcomp(state.x77[, 1:6], scale = T)
biplot(state.pca)

MDS
hdist <- read.table("HokkaidoCitiesMDS.tsv", header = F)
head(hdist)
## V1 V2 V3 V4 V5 V6 V7 V8
## 1 0 115 274 249 152 88 152 30
## 2 115 0 202 188 118 200 263 130
## 3 274 202 0 358 313 360 413 266
## 4 249 188 358 0 98 290 330 277
## 5 152 118 313 98 0 195 240 180
## 6 88 200 360 290 195 0 64 98
hcities <- c("札幌","旭川","稚内市","釧路市","帯広市","室蘭市","函館","小樽")
names(hdist) <- hcities
head(hdist)
## 札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 1 0 115 274 249 152 88 152 30
## 2 115 0 202 188 118 200 263 130
## 3 274 202 0 358 313 360 413 266
## 4 249 188 358 0 98 290 330 277
## 5 152 118 313 98 0 195 240 180
## 6 88 200 360 290 195 0 64 98
rownames(hdist) <- hcities
head(hdist)
## 札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 札幌 0 115 274 249 152 88 152 30
## 旭川 115 0 202 188 118 200 263 130
## 稚内市 274 202 0 358 313 360 413 266
## 釧路市 249 188 358 0 98 290 330 277
## 帯広市 152 118 313 98 0 195 240 180
## 室蘭市 88 200 360 290 195 0 64 98
hdist.cmd <- cmdscale(hdist)
head(hdist.cmd)
## [,1] [,2]
## 札幌 32.93767 -39.528695
## 旭川 -74.44872 7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市 -53.59358 193.147210
## 帯広市 -10.26623 105.270465
## 室蘭市 122.52705 -37.108496
class(hdist.cmd)
## [1] "matrix"
hdist.cmd.df <- as.data.frame(hdist.cmd)
head(hdist.cmd.df)
## V1 V2
## 札幌 32.93767 -39.528695
## 旭川 -74.44872 7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市 -53.59358 193.147210
## 帯広市 -10.26623 105.270465
## 室蘭市 122.52705 -37.108496
class(hdist.cmd.df)
## [1] "data.frame"
hdist.cmd.df$city <- rownames(hdist.cmd.df)
head(hdist.cmd.df)
## V1 V2 city
## 札幌 32.93767 -39.528695 札幌
## 旭川 -74.44872 7.516849 旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市 -53.59358 193.147210 釧路市
## 帯広市 -10.26623 105.270465 帯広市
## 室蘭市 122.52705 -37.108496 室蘭市
names(hdist.cmd.df) <- c("x","y","city")
head(hdist.cmd.df)
## x y city
## 札幌 32.93767 -39.528695 札幌
## 旭川 -74.44872 7.516849 旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市 -53.59358 193.147210 釧路市
## 帯広市 -10.26623 105.270465 帯広市
## 室蘭市 122.52705 -37.108496 室蘭市
ggplot(hdist.cmd.df, aes(x=-x, y=-y, label=city)) +
geom_text() +
theme_bw(16)

k-means
library(ggplot2)
state.km <- kmeans(scale(state.x77[,1:6]),3)
# 主成分分析の結果にクラスターの情報を付与する
state.pca.df <- data.frame(state.pca$x)
state.pca.df$name <- rownames(state.pca.df)
state.pca.df$cluster <- as.factor(state.km$cluster)
# 表示
ggplot(state.pca.df, aes(x=PC1,y=PC2,label=name,col=cluster)) +
geom_text() +
theme_bw(16)

radar chart
#install.packages("fmsb")
library(fmsb)
df <- as.data.frame(scale(state.km$centers))
dfmax <- apply(df,2,max)+1
dfmax
## Population Income Illiteracy Life Exp Murder HS Grad
## 2.089196 1.859270 2.125864 2.056296 1.859310 1.750673
dfmin <- apply(df,2,min)-1
dfmin
## Population Income Illiteracy Life Exp Murder HS Grad
## -1.876620 -2.097648 -1.785020 -1.932103 -2.097630 -2.135184
df <- rbind(dfmax,dfmin,df)
df
## Population Income Illiteracy Life Exp Murder HS Grad
## 1 2.0891961 1.8592702 2.1258639 2.0562955 1.8593096 1.7506728
## 2 -1.8766204 -2.0976478 -1.7850195 -1.9321030 -2.0976295 -2.1351836
## 11 -0.2125757 -1.0976478 1.1258639 -0.9321030 0.8593096 -1.1351836
## 21 1.0891961 0.8592702 -0.3408443 -0.1241926 0.2383199 0.3845108
## 3 -0.8766204 0.2383776 -0.7850195 1.0562955 -1.0976295 0.7506728
radarchart(df,seg=5,plty=1,pcol=rainbow(3))
legend("topright", legend=1:3, col=rainbow(3), lty=1)

SVM and RandomForest by caret