Rの導入

read file

library(ggplot2)
# tsv
body.data <- read.table("body_sample.tsv", header = T, stringsAsFactors = F)
head(body.data)
##   id gender height weight
## 1  1      M 157.67  64.82
## 2  2      M 178.76  72.38
## 3  3      M 161.95  64.52
## 4  4      M 162.26  63.35
## 5  5      M 167.95  68.76
## 6  6      M 165.59  66.40
#csv
body.data <- read.csv("body_sample.csv", header = T, stringsAsFactors = F)
head(body.data)
##   id gender height weight
## 1  1      M 157.67  64.82
## 2  2      M 178.76  72.38
## 3  3      M 161.95  64.52
## 4  4      M 162.26  63.35
## 5  5      M 167.95  68.76
## 6  6      M 165.59  66.40

summary

summary(body.data)
##        id           gender              height          weight     
##  Min.   :  1.0   Length:400         Min.   :135.5   Min.   :31.44  
##  1st Qu.:100.8   Class :character   1st Qu.:152.4   1st Qu.:50.93  
##  Median :200.5   Mode  :character   Median :158.2   Median :57.78  
##  Mean   :200.5                      Mean   :158.4   Mean   :58.16  
##  3rd Qu.:300.2                      3rd Qu.:163.9   3rd Qu.:65.53  
##  Max.   :400.0                      Max.   :181.5   Max.   :78.99

標準偏差と不偏分散

sd(body.data$height)
## [1] 8.210723
var(body.data$weight)
## [1] 84.0654

histogram

ggplot(body.data, aes(x=height)) +
  geom_histogram() +
  theme_bw(16) +
  ylab("count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

histogram by male and female

ggplot(body.data, aes(x=height, fill=gender)) +
  geom_histogram() +
  theme_bw(16) +
  ylab("count")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

boxplot

ggplot(body.data, aes(x=gender, y=height, fill=gender)) +
  geom_boxplot() +
  theme_bw(16)

point plot with height and weight

ggplot(body.data, aes(x=height, y=weight)) +
  geom_point() +
  theme_bw(16)

point plot and 回帰直線 with height and weight

ggplot(body.data, aes(x=height, y=weight)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm")

男女別point plot and 回帰直線 with height and weight

ggplot(body.data, aes(x=height, y=weight, col=gender)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm")

相関関係

# total
cor(body.data$height, body.data$weight)
## [1] 0.8928748
# male
body.data.m <- body.data[body.data$gender == "M",]
cor(body.data.m$height, body.data.m$weight)
## [1] 0.863457
# female
body.data.f <- body.data[body.data$gender == "F",]
cor(body.data.f$height, body.data.f$weight)
## [1] 0.9173599

仮想の投資と売上データの散布図

amount1.data <- read.csv("amount1.csv")
head(amount1.data)
##     amount    invest
## 1 296.6501 132.23378
## 2 268.5920 107.81794
## 3 304.8194  90.57184
## 4 263.6498  77.51110
## 5 246.7538  74.13405
## 6 309.6545 161.44121
summary(amount1.data)
##      amount          invest      
##  Min.   :220.2   Min.   : 37.02  
##  1st Qu.:279.4   1st Qu.:103.92  
##  Median :302.0   Median :145.05  
##  Mean   :298.1   Mean   :150.01  
##  3rd Qu.:320.8   3rd Qu.:196.05  
##  Max.   :354.5   Max.   :258.54
ggplot(amount1.data, aes(x=invest, y=amount)) +
  geom_point() +
  theme_bw(16)

線形回帰モデルの構築

amount1.lm1 <- lm(amount~invest, data=amount1.data)
summary(amount1.lm1)
## 
## Call:
## lm(formula = amount ~ invest, data = amount1.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.521  -8.274   0.322   8.700  36.961 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 229.81293    2.56174   89.71   <2e-16 ***
## invest        0.45554    0.01593   28.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 198 degrees of freedom
## Multiple R-squared:  0.8051, Adjusted R-squared:  0.8042 
## F-statistic: 818.1 on 1 and 198 DF,  p-value: < 2.2e-16

逓減型回帰モデル

ggplot(amount1.data, aes(x=invest, y=amount)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm", formula = y ~ log(x))

残差の分布

plot(amount1.lm1, which=1)

逓減型回帰モデル構築

amount1.lm2 <- lm(amount~log(invest), data=amount1.data)
summary(amount1.lm2)
## 
## Call:
## lm(formula = amount ~ log(invest), data = amount1.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.328  -7.181  -0.875   7.540  32.365 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.005      9.436   -0.53    0.596    
## log(invest)   61.574      1.909   32.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.84 on 198 degrees of freedom
## Multiple R-squared:  0.8401, Adjusted R-squared:  0.8393 
## F-statistic:  1040 on 1 and 198 DF,  p-value: < 2.2e-16

逓減型回帰モデルでの残差

plot(amount1.lm2, which=1)

ロジスティック回帰モデル

z <- data.frame(Titanic)
titanic.data <- data.frame(
  Class=rep(z$Class, z$Freq),
  Sex=rep(z$Sex, z$Freq),
  Age=rep(z$Age, z$Freq),
  Survived=rep(z$Survived, z$Freq)
)
titanic.logit <- glm(Survived~., data=titanic.data, family = binomial)
summary(titanic.logit)
## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = titanic.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0812  -0.7149  -0.6656   0.6858   2.1278  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.6853     0.2730   2.510   0.0121 *  
## Class2nd     -1.0181     0.1960  -5.194 2.05e-07 ***
## Class3rd     -1.7778     0.1716 -10.362  < 2e-16 ***
## ClassCrew    -0.8577     0.1573  -5.451 5.00e-08 ***
## SexFemale     2.4201     0.1404  17.236  < 2e-16 ***
## AgeAdult     -1.0615     0.2440  -4.350 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2769.5  on 2200  degrees of freedom
## Residual deviance: 2210.1  on 2195  degrees of freedom
## AIC: 2222.1
## 
## Number of Fisher Scoring iterations: 4

epicalcパッケージの読み込みとオッズの算出

#install.packages("devtools")
library(devtools)
## Warning: package 'devtools' was built under R version 3.4.3
#install_github("cran/epicalc")
library(epicalc)
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## Loading required package: nnet
## 
## Attaching package: 'epicalc'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
logistic.display(titanic.logit, simplified = T)
##  
##                   OR lower95ci  upper95ci     Pr(>|Z|)
## Class2nd   0.3612825 0.2460447  0.5304933 2.053331e-07
## Class3rd   0.1690159 0.1207510  0.2365727 3.691891e-25
## ClassCrew  0.4241466 0.3115940  0.5773549 5.004592e-08
## SexFemale 11.2465380 8.5408719 14.8093331 1.431830e-66
## AgeAdult   0.3459219 0.2144193  0.5580746 1.360490e-05

決定木モデル

#install.packages("partykit")
library(rpart)
library(partykit)
## Warning: package 'partykit' was built under R version 3.4.3
## Loading required package: grid
## Loading required package: libcoin
## Warning: package 'libcoin' was built under R version 3.4.3
## Loading required package: mvtnorm
titanic.rp <- rpart(Survived~., data = titanic.data)
plot(as.party(titanic.rp), tp_args = T)

主成分分析

state.pca <- prcomp(state.x77[, 1:6], scale = T)
biplot(state.pca)

MDS

hdist <- read.table("HokkaidoCitiesMDS.tsv", header = F)
head(hdist)
##    V1  V2  V3  V4  V5  V6  V7  V8
## 1   0 115 274 249 152  88 152  30
## 2 115   0 202 188 118 200 263 130
## 3 274 202   0 358 313 360 413 266
## 4 249 188 358   0  98 290 330 277
## 5 152 118 313  98   0 195 240 180
## 6  88 200 360 290 195   0  64  98
hcities <- c("札幌","旭川","稚内市","釧路市","帯広市","室蘭市","函館","小樽")
names(hdist) <- hcities
head(hdist)
##   札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 1    0  115    274    249    152     88  152   30
## 2  115    0    202    188    118    200  263  130
## 3  274  202      0    358    313    360  413  266
## 4  249  188    358      0     98    290  330  277
## 5  152  118    313     98      0    195  240  180
## 6   88  200    360    290    195      0   64   98
rownames(hdist) <- hcities
head(hdist)
##        札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 札幌      0  115    274    249    152     88  152   30
## 旭川    115    0    202    188    118    200  263  130
## 稚内市  274  202      0    358    313    360  413  266
## 釧路市  249  188    358      0     98    290  330  277
## 帯広市  152  118    313     98      0    195  240  180
## 室蘭市   88  200    360    290    195      0   64   98
hdist.cmd <- cmdscale(hdist)
head(hdist.cmd)
##              [,1]        [,2]
## 札幌     32.93767  -39.528695
## 旭川    -74.44872    7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市  -53.59358  193.147210
## 帯広市  -10.26623  105.270465
## 室蘭市  122.52705  -37.108496
class(hdist.cmd)
## [1] "matrix"
hdist.cmd.df <- as.data.frame(hdist.cmd)
head(hdist.cmd.df)
##                V1          V2
## 札幌     32.93767  -39.528695
## 旭川    -74.44872    7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市  -53.59358  193.147210
## 帯広市  -10.26623  105.270465
## 室蘭市  122.52705  -37.108496
class(hdist.cmd.df)
## [1] "data.frame"
hdist.cmd.df$city <- rownames(hdist.cmd.df)
head(hdist.cmd.df)
##                V1          V2   city
## 札幌     32.93767  -39.528695   札幌
## 旭川    -74.44872    7.516849   旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市  -53.59358  193.147210 釧路市
## 帯広市  -10.26623  105.270465 帯広市
## 室蘭市  122.52705  -37.108496 室蘭市
names(hdist.cmd.df) <- c("x","y","city")
head(hdist.cmd.df)
##                 x           y   city
## 札幌     32.93767  -39.528695   札幌
## 旭川    -74.44872    7.516849   旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市  -53.59358  193.147210 釧路市
## 帯広市  -10.26623  105.270465 帯広市
## 室蘭市  122.52705  -37.108496 室蘭市
ggplot(hdist.cmd.df, aes(x=-x, y=-y, label=city)) +
  geom_text() +
  theme_bw(16)

k-means

library(ggplot2)
state.km <-  kmeans(scale(state.x77[,1:6]),3)
# 主成分分析の結果にクラスターの情報を付与する
state.pca.df <- data.frame(state.pca$x)
state.pca.df$name <- rownames(state.pca.df)
state.pca.df$cluster <- as.factor(state.km$cluster)
# 表示
ggplot(state.pca.df, aes(x=PC1,y=PC2,label=name,col=cluster)) +
  geom_text() +
  theme_bw(16)

radar chart

#install.packages("fmsb")
library(fmsb)
df <- as.data.frame(scale(state.km$centers))
dfmax <- apply(df,2,max)+1
dfmax
## Population     Income Illiteracy   Life Exp     Murder    HS Grad 
##   2.089196   1.859270   2.125864   2.056296   1.859310   1.750673
dfmin <- apply(df,2,min)-1
dfmin
## Population     Income Illiteracy   Life Exp     Murder    HS Grad 
##  -1.876620  -2.097648  -1.785020  -1.932103  -2.097630  -2.135184
df <- rbind(dfmax,dfmin,df)
df
##    Population     Income Illiteracy   Life Exp     Murder    HS Grad
## 1   2.0891961  1.8592702  2.1258639  2.0562955  1.8593096  1.7506728
## 2  -1.8766204 -2.0976478 -1.7850195 -1.9321030 -2.0976295 -2.1351836
## 11 -0.2125757 -1.0976478  1.1258639 -0.9321030  0.8593096 -1.1351836
## 21  1.0891961  0.8592702 -0.3408443 -0.1241926  0.2383199  0.3845108
## 3  -0.8766204  0.2383776 -0.7850195  1.0562955 -1.0976295  0.7506728
radarchart(df,seg=5,plty=1,pcol=rainbow(3))
legend("topright", legend=1:3, col=rainbow(3), lty=1)

SVM and RandomForest by caret