Rの導入

read file

library(ggplot2)
# tsv
body.data <- read.table("body_sample.tsv", header = T, stringsAsFactors = F)
head(body.data)

##   id gender height weight
## 1  1      M 157.67  64.82
## 2  2      M 178.76  72.38
## 3  3      M 161.95  64.52
## 4  4      M 162.26  63.35
## 5  5      M 167.95  68.76
## 6  6      M 165.59  66.40

#csv
body.data <- read.csv("body_sample.csv", header = T, stringsAsFactors = F)
head(body.data)

##   id gender height weight
## 1  1      M 157.67  64.82
## 2  2      M 178.76  72.38
## 3  3      M 161.95  64.52
## 4  4      M 162.26  63.35
## 5  5      M 167.95  68.76
## 6  6      M 165.59  66.40

summary

summary(body.data)

##        id           gender              height          weight     
##  Min.   :  1.0   Length:400         Min.   :135.5   Min.   :31.44  
##  1st Qu.:100.8   Class :character   1st Qu.:152.4   1st Qu.:50.93  
##  Median :200.5   Mode  :character   Median :158.2   Median :57.78  
##  Mean   :200.5                      Mean   :158.4   Mean   :58.16  
##  3rd Qu.:300.2                      3rd Qu.:163.9   3rd Qu.:65.53  
##  Max.   :400.0                      Max.   :181.5   Max.   :78.99

標準偏差と不偏分散

sd(body.data$height)

## [1] 8.210723

var(body.data$weight)

## [1] 84.0654

histogram

ggplot(body.data, aes(x=height)) +
  geom_histogram() +
  theme_bw(16) +
  ylab("count")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

histogram by male and female

ggplot(body.data, aes(x=height, fill=gender)) +
  geom_histogram() +
  theme_bw(16) +
  ylab("count")

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

boxplot

ggplot(body.data, aes(x=gender, y=height, fill=gender)) +
  geom_boxplot() +
  theme_bw(16)

point plot with height and weight

ggplot(body.data, aes(x=height, y=weight)) +
  geom_point() +
  theme_bw(16)

point plot and 回帰直線 with height and weight

ggplot(body.data, aes(x=height, y=weight)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm")

男女別point plot and 回帰直線 with height and weight

ggplot(body.data, aes(x=height, y=weight, col=gender)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm")

相関関係

# total
cor(body.data$height, body.data$weight)

## [1] 0.8928748

# male
body.data.m <- body.data[body.data$gender == "M",]
cor(body.data.m$height, body.data.m$weight)

## [1] 0.863457

# female
body.data.f <- body.data[body.data$gender == "F",]
cor(body.data.f$height, body.data.f$weight)

## [1] 0.9173599

仮想の投資と売上データの散布図

amount1.data <- read.csv("amount1.csv")
head(amount1.data)

##     amount    invest
## 1 296.6501 132.23378
## 2 268.5920 107.81794
## 3 304.8194  90.57184
## 4 263.6498  77.51110
## 5 246.7538  74.13405
## 6 309.6545 161.44121

summary(amount1.data)

##      amount          invest      
##  Min.   :220.2   Min.   : 37.02  
##  1st Qu.:279.4   1st Qu.:103.92  
##  Median :302.0   Median :145.05  
##  Mean   :298.1   Mean   :150.01  
##  3rd Qu.:320.8   3rd Qu.:196.05  
##  Max.   :354.5   Max.   :258.54

ggplot(amount1.data, aes(x=invest, y=amount)) +
  geom_point() +
  theme_bw(16)

線形回帰モデルの構築

amount1.lm1 <- lm(amount~invest, data=amount1.data)
summary(amount1.lm1)

## 
## Call:
## lm(formula = amount ~ invest, data = amount1.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -31.521  -8.274   0.322   8.700  36.961 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 229.81293    2.56174   89.71   <2e-16 ***
## invest        0.45554    0.01593   28.60   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.07 on 198 degrees of freedom
## Multiple R-squared:  0.8051, Adjusted R-squared:  0.8042 
## F-statistic: 818.1 on 1 and 198 DF,  p-value: < 2.2e-16

逓減型回帰モデル

ggplot(amount1.data, aes(x=invest, y=amount)) +
  geom_point() +
  theme_bw(16) +
  geom_smooth(method = "lm", formula = y ~ log(x))

残差の分布

plot(amount1.lm1, which=1)

逓減型回帰モデル構築

amount1.lm2 <- lm(amount~log(invest), data=amount1.data)
summary(amount1.lm2)

## 
## Call:
## lm(formula = amount ~ log(invest), data = amount1.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -35.328  -7.181  -0.875   7.540  32.365 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   -5.005      9.436   -0.53    0.596    
## log(invest)   61.574      1.909   32.26   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.84 on 198 degrees of freedom
## Multiple R-squared:  0.8401, Adjusted R-squared:  0.8393 
## F-statistic:  1040 on 1 and 198 DF,  p-value: < 2.2e-16

逓減型回帰モデルでの残差

plot(amount1.lm2, which=1)

ロジスティック回帰モデル

z <- data.frame(Titanic)
titanic.data <- data.frame(
  Class=rep(z$Class, z$Freq),
  Sex=rep(z$Sex, z$Freq),
  Age=rep(z$Age, z$Freq),
  Survived=rep(z$Survived, z$Freq)
)
titanic.logit <- glm(Survived~., data=titanic.data, family = binomial)
summary(titanic.logit)

## 
## Call:
## glm(formula = Survived ~ ., family = binomial, data = titanic.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.0812  -0.7149  -0.6656   0.6858   2.1278  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   0.6853     0.2730   2.510   0.0121 *  
## Class2nd     -1.0181     0.1960  -5.194 2.05e-07 ***
## Class3rd     -1.7778     0.1716 -10.362  < 2e-16 ***
## ClassCrew    -0.8577     0.1573  -5.451 5.00e-08 ***
## SexFemale     2.4201     0.1404  17.236  < 2e-16 ***
## AgeAdult     -1.0615     0.2440  -4.350 1.36e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2769.5  on 2200  degrees of freedom
## Residual deviance: 2210.1  on 2195  degrees of freedom
## AIC: 2222.1
## 
## Number of Fisher Scoring iterations: 4

epicalcパッケージの読み込みとオッズの算出

#install.packages("devtools")
library(devtools)

## Warning: package 'devtools' was built under R version 3.4.3

#install_github("cran/epicalc")
library(epicalc)

## Loading required package: foreign

## Loading required package: survival

## Loading required package: MASS

## Loading required package: nnet

## 
## Attaching package: 'epicalc'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

logistic.display(titanic.logit, simplified = T)

##  
##                   OR lower95ci  upper95ci     Pr(>|Z|)
## Class2nd   0.3612825 0.2460447  0.5304933 2.053331e-07
## Class3rd   0.1690159 0.1207510  0.2365727 3.691891e-25
## ClassCrew  0.4241466 0.3115940  0.5773549 5.004592e-08
## SexFemale 11.2465380 8.5408719 14.8093331 1.431830e-66
## AgeAdult   0.3459219 0.2144193  0.5580746 1.360490e-05

決定木モデル

#install.packages("partykit")
library(rpart)
library(partykit)

## Warning: package 'partykit' was built under R version 3.4.3

## Loading required package: grid

## Loading required package: libcoin

## Warning: package 'libcoin' was built under R version 3.4.3

## Loading required package: mvtnorm

titanic.rp <- rpart(Survived~., data = titanic.data)
plot(as.party(titanic.rp), tp_args = T)

主成分分析

state.pca <- prcomp(state.x77[, 1:6], scale = T)
biplot(state.pca)

MDS

hdist <- read.table("HokkaidoCitiesMDS.tsv", header = F)
head(hdist)

##    V1  V2  V3  V4  V5  V6  V7  V8
## 1   0 115 274 249 152  88 152  30
## 2 115   0 202 188 118 200 263 130
## 3 274 202   0 358 313 360 413 266
## 4 249 188 358   0  98 290 330 277
## 5 152 118 313  98   0 195 240 180
## 6  88 200 360 290 195   0  64  98

hcities <- c("札幌","旭川","稚内市","釧路市","帯広市","室蘭市","函館","小樽")
names(hdist) <- hcities
head(hdist)

##   札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 1    0  115    274    249    152     88  152   30
## 2  115    0    202    188    118    200  263  130
## 3  274  202      0    358    313    360  413  266
## 4  249  188    358      0     98    290  330  277
## 5  152  118    313     98      0    195  240  180
## 6   88  200    360    290    195      0   64   98

rownames(hdist) <- hcities
head(hdist)

##        札幌 旭川 稚内市 釧路市 帯広市 室蘭市 函館 小樽
## 札幌      0  115    274    249    152     88  152   30
## 旭川    115    0    202    188    118    200  263  130
## 稚内市  274  202      0    358    313    360  413  266
## 釧路市  249  188    358      0     98    290  330  277
## 帯広市  152  118    313     98      0    195  240  180
## 室蘭市   88  200    360    290    195      0   64   98

hdist.cmd <- cmdscale(hdist)
head(hdist.cmd)

##              [,1]        [,2]
## 札幌     32.93767  -39.528695
## 旭川    -74.44872    7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市  -53.59358  193.147210
## 帯広市  -10.26623  105.270465
## 室蘭市  122.52705  -37.108496

class(hdist.cmd)

## [1] "matrix"

hdist.cmd.df <- as.data.frame(hdist.cmd)
head(hdist.cmd.df)

##                V1          V2
## 札幌     32.93767  -39.528695
## 旭川    -74.44872    7.516849
## 稚内市 -226.74991 -119.948212
## 釧路市  -53.59358  193.147210
## 帯広市  -10.26623  105.270465
## 室蘭市  122.52705  -37.108496

class(hdist.cmd.df)

## [1] "data.frame"

hdist.cmd.df$city <- rownames(hdist.cmd.df)
head(hdist.cmd.df)

##                V1          V2   city
## 札幌     32.93767  -39.528695   札幌
## 旭川    -74.44872    7.516849   旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市  -53.59358  193.147210 釧路市
## 帯広市  -10.26623  105.270465 帯広市
## 室蘭市  122.52705  -37.108496 室蘭市

names(hdist.cmd.df) <- c("x","y","city")
head(hdist.cmd.df)

##                 x           y   city
## 札幌     32.93767  -39.528695   札幌
## 旭川    -74.44872    7.516849   旭川
## 稚内市 -226.74991 -119.948212 稚内市
## 釧路市  -53.59358  193.147210 釧路市
## 帯広市  -10.26623  105.270465 帯広市
## 室蘭市  122.52705  -37.108496 室蘭市

ggplot(hdist.cmd.df, aes(x=-x, y=-y, label=city)) +
  geom_text() +
  theme_bw(16)

k-means

library(ggplot2)
state.km <-  kmeans(scale(state.x77[,1:6]),3)
# 主成分分析の結果にクラスターの情報を付与する
state.pca.df <- data.frame(state.pca$x)
state.pca.df$name <- rownames(state.pca.df)
state.pca.df$cluster <- as.factor(state.km$cluster)
# 表示
ggplot(state.pca.df, aes(x=PC1,y=PC2,label=name,col=cluster)) +
  geom_text() +
  theme_bw(16)

radar chart

#install.packages("fmsb")
library(fmsb)
df <- as.data.frame(scale(state.km$centers))
dfmax <- apply(df,2,max)+1
dfmax

## Population     Income Illiteracy   Life Exp     Murder    HS Grad 
##   2.089196   1.859270   2.125864   2.056296   1.859310   1.750673

dfmin <- apply(df,2,min)-1
dfmin

## Population     Income Illiteracy   Life Exp     Murder    HS Grad 
##  -1.876620  -2.097648  -1.785020  -1.932103  -2.097630  -2.135184

df <- rbind(dfmax,dfmin,df)
df

##    Population     Income Illiteracy   Life Exp     Murder    HS Grad
## 1   2.0891961  1.8592702  2.1258639  2.0562955  1.8593096  1.7506728
## 2  -1.8766204 -2.0976478 -1.7850195 -1.9321030 -2.0976295 -2.1351836
## 11 -0.2125757 -1.0976478  1.1258639 -0.9321030  0.8593096 -1.1351836
## 21  1.0891961  0.8592702 -0.3408443 -0.1241926  0.2383199  0.3845108
## 3  -0.8766204  0.2383776 -0.7850195  1.0562955 -1.0976295  0.7506728

radarchart(df,seg=5,plty=1,pcol=rainbow(3))
legend("topright", legend=1:3, col=rainbow(3), lty=1)

datascientist_20180118

Rの導入

read file

summary

標準偏差と不偏分散

histogram

histogram by male and female

boxplot

point plot with height and weight

point plot and 回帰直線 with height and weight

男女別point plot and 回帰直線 with height and weight

相関関係

仮想の投資と売上データの散布図

線形回帰モデルの構築

逓減型回帰モデル

残差の分布

逓減型回帰モデル構築

逓減型回帰モデルでの残差

ロジスティック回帰モデル

epicalcパッケージの読み込みとオッズの算出

決定木モデル

主成分分析

MDS

k-means

radar chart

SVM and RandomForest by caret