2.1 サンプルデータの紹介

# install.packages("ISLR")
library(ISLR)

# クレジットカード顧客データ、10000行のサンプル、4変数
Default
# If you cannot find the function “as_tibble”, you must install either the tidyverse package or the tibble package. It will resolve the issue.
library(tidyverse)
# tibble 形のデータフレームに変換する
DEFAULT <- as_tibble(Default)
head(DEFAULT,n=10)

データフレーム

  • 行:一人一人、一つ一つの構成単位
  • 列:変数・属性、一人一人のサンプルの特徴、異なる変数は異なる列に格納

2.2 変数

変数列名の確認と変更

# データフレームの列名の確認
colnames(DEFAULT)
[1] "default" "student" "balance" "income" 
# データフレームの構造(各列のデータ型)の確認
str(DEFAULT)
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ balance: num [1:10000] 730 817 1074 529 786 ...
 $ income : num [1:10000] 44362 12106 31767 35704 38463 ...
# 列名変更
colnames(DEFAULT) <- c("d","s","b","i")
colnames(DEFAULT)
[1] "d" "s" "b" "i"
# 列名変更 dplyr パッケージ利用
DEFAULT <- rename(DEFAULT, #対象データフレーム,
                  default = d, # [新列名 = 旧列名] 修正必要な列のみ
                  student = s)
colnames(DEFAULT)
[1] "default" "student" "b"       "i"      

変数型の確認と変換

変数のタイプは: 1. 量的変数(数量を表す):連続変数(長さ、重さ、時間)、離散変数(件数、個数) 2. カテゴリー変数(質的変数):数量を表すものでない(年代、性別、所属)

よくある変数の型:

データ型 記号 使用例
数値型 numeric 1,1.1,2 
整数型 integer<int> 1,2,3 
実数型 double<dbl> 1.345 
文字列型 character<chr> “営業部”,“人事部”… 
因子型 factor<fct> 営業部,人事部… 
論理値型 logical<lgl> TRUE(1),FALSE(0)… 
日付型 Date<date> 2020-01-01… 
# Factor、numなどがデータ型です
str(DEFAULT)
tibble [10,000 × 4] (S3: tbl_df/tbl/data.frame)
 $ default: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
 $ student: Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
 $ b      : num [1:10000] 730 817 1074 529 786 ...
 $ i      : num [1:10000] 44362 12106 31767 35704 38463 ...
# is.xxx(変数):変数型の判断
num <- c(1,2,3)
is.factor(num)
[1] FALSE
# is.xxx(変数):変数型の判断
string_data <- c("1","2","3")
is.character(string_data)
[1] TRUE
# Directory that holds the tutorial data. The CSV is read through a full path
# built with file.path() instead of calling setwd(), so the working directory
# of the session is left untouched.
data_dir <- "C:/Users/jun.li/OneDrive - UiPath/00.Study/00.DataSciense/R/Udemy"

# col_types is a compact string with one letter per column; each letter is the
# initial of a data type (the letters i, d, c, f, l, D; e.g. integer -> i,
# Date -> D). "iiii" therefore reads all four columns as integer.
HAKO <- read_csv(
  file = file.path(data_dir, "testdata.csv"),
  col_types = "iiii",
  locale = locale(encoding = "UTF-8")
)
HAKO
# データフレーム名$列名 <- as.XXX(データフレーム名$列名)
# XXXがデータ型(numeric,integer,doubleなど)
HAKO$番号<- as.character(HAKO$番号)
str(HAKO)
spc_tbl_ [1,000 × 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ 番号: chr [1:1000] "1" "2" "3" "4" ...
 $ 列A : int [1:1000] 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 ...
 $ 列B : int [1:1000] 2001 2002 2003 2004 2005 2006 2007 2008 2009 2010 ...
 $ 列C : int [1:1000] 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 ...
 - attr(*, "spec")=
  .. cols(
  ..   番号 = col_integer(),
  ..   列A = col_integer(),
  ..   列B = col_integer(),
  ..   列C = col_integer()
  .. )
 - attr(*, "problems")=<externalptr> 

データフレーム$列名で列を抽出できます

head(HAKO$番号,10)
 [1] "1"  "2"  "3"  "4"  "5"  "6"  "7"  "8"  "9"  "10"

2.3 演算

基本演算子

記号 意味 使用例
^ べき乗 2^2,2^5,2+10^2  
: +-1の等差数列 1:10,10:-10  
*,/ 掛け算、割り算 1*2,3-2/6  
+,- 足し算、引き算 1+2,3-1/2  
<,<= 大小比較 0<1  
==,!= 等しい、等しくない 1==1,“a”!=“b”  
&, | かつ、または 1>0&2>3,1>0|2>3
5:-5
 [1]  5  4  3  2  1  0 -1 -2 -3 -4 -5
1==1
[1] TRUE

行数、列数

# データフレームの行数と列数の確認
dim(HAKO)
[1] 1000    4
length(HAKO$番号)
[1] 1000

変数の代表値

関数 意味
mean(データフレーム$列名) 平均
median(データフレーム$列名) 中央値
min(データフレーム$列名) 最小値
max(データフレーム$列名) 最大値
sum(データフレーム$列名) 合計
table(データフレーム$列名) 集計
summary(データフレーム) 各変数の要約
# 以前修正した列名を元に戻す
 DEFAULT <- rename(DEFAULT, #対象データフレーム,
                  balance = b, # [新列名 = 旧列名] 修正必要な列のみ
                  income = i)

#構造確認
#str(DEFAULT)
summary(DEFAULT)
 default    student       balance           income     
 No :9667   No :7056   Min.   :   0.0   Min.   :  772  
 Yes: 333   Yes:2944   1st Qu.: 481.7   1st Qu.:21340  
                       Median : 823.6   Median :34553  
                       Mean   : 835.4   Mean   :33517  
                       3rd Qu.:1166.3   3rd Qu.:43808  
                       Max.   :2654.3   Max.   :73554  
median(DEFAULT$balance)
 警告:  Unknown or uninitialised column: `balance`.
NULL
table(DEFAULT$student)

  No  Yes 
7056 2944 

変数の散布度(ばらつき度合い)

変数(列)の散布度の確認: 1. var(データフレーム$列名):不偏分散 2. sd(データフレーム$列名):標準偏差

var(DEFAULT$income) #不偏分散 
[1] 177865955
sd(DEFAULT$income) #標準偏差
[1] 13336.64
# Recompute the unbiased (sample) variance shown above by hand:
# the sum of squared deviations from the mean, divided by n - 1.
# The result is stored in income_var rather than var so that the base
# function var() is not masked by a numeric object of the same name.
n <- length(DEFAULT$income)
m <- mean(DEFAULT$income)
income_var <- sum((DEFAULT$income - m)^2) / (n - 1)
income_var
[1] 177865955

変数の演算

  1. 変数と一つ値の演算:変数の全ての値に一つの値が演算される
  2. 変数と変数の演算(値の数が等しい):順番通りに演算される
  3. 変数と変数の演算(値の数が異なる場合):順番通りに演算され、値の数が少ない変数は繰り返されて演算される
c1 <- c(1,2,3)
c2 <- c(2,4)
c3 <- c(7)
c4 <- c(10,20,30)

c1 + c3 #case1
[1]  8  9 10
c1 * c4 #case2
[1] 10 40 90
c1 + c2 #case3
 警告:  longer object length is not a multiple of shorter object length
[1] 3 6 5
LS0tDQp0aXRsZTogIlIvUlN0dWRpbygyKS3jg4fjg7zjgr/jga7mpoLoprMiDQpvdXRwdXQ6IA0KICBodG1sX25vdGVib29rOg0KICAgIHRvYzogdHJ1ZQ0KZWRpdG9yX29wdGlvbnM6IA0KICBtYXJrZG93bjogDQogICAgd3JhcDogNzINCi0tLQ0KDQojIDIuMSDjgrXjg7Pjg5fjg6vjg4fjg7zjgr/jga7ntLnku4sNCg0KYGBge3J9DQojIGluc3RhbGwucGFja2FnZXMoIklTTFIiKQ0KbGlicmFyeShJU0xSKQ0KbGlicmFyeSh0aWR5dmVyc2UpDQojIOOCr+ODrOOCuOODg+ODiOOCq+ODvOODiemhp+WuouODh+ODvOOCv+OAgTEwMDAw6KGM44Gu44K144Oz44OX44Or44CB77yU5aSJ5pWwDQpEZWZhdWx0DQpgYGANCg0KYGBge3J9DQojIElmIHlvdSBjYW5ub3QgZmluZCB0aGUgZnVuY3Rpb24g4oCcYXNfdGliYmxl4oCdLCB5b3UgbXVzdCBpbnN0YWxsIGVpdGhlciB0aGUgdGlkeXZlcnNlIHBhY2thZ2Ugb3IgdGhlIHRpYmJsZSBwYWNrYWdlLiBJdCB3aWxsIHJlc29sdmUgdGhlIGlzc3VlLg0KbGlicmFyeSh0aWR5dmVyc2UpDQojIHRpYmJsZSDlvaLjga7jg4fjg7zjgr/jg5Xjg6zjg7zjg6DjgavlpInmj5vjgZnjgosNCkRFRkFVTFQgPC0gYXNfdGliYmxlKERlZmF1bHQpDQpoZWFkKERFRkFVTFQsbj0xMCkNCmBgYA0KDQojIyDjg4fjg7zjgr/jg5Xjg6zjg7zjg6ANCg0KLSAgIOihjO+8muS4gOS6uuS4gOS6uuOAgeS4gOOBpOS4gOOBpOOBruani+aIkOWNmOS9jQ0KLSAgIOWIl++8muWkieaVsOODu+WxnuaAp+OAgeS4gOS6uuS4gOS6uuOBruOCteODs+ODl+ODq+OBrueJueW+tOOAgeeVsOOBquOCi+WkieaVsOOBr+eVsOOBquOCi+WIl+OBq+agvOe0jQ0KDQojIDIuMiDlpInmlbANCg0KIyMg5aSJ5pWw5YiX5ZCN44Gu56K66KqN44Go5aSJ5pu0DQoNCmBgYHtyfQ0KIyDjg4fjg7zjgr/jg5Xjg6zjg7zjg6Djga7liJflkI3jga7norroqo0NCmNvbG5hbWVzKERFRkFVTFQpDQpgYGANCg0KYGBge3J9DQojIOODh+ODvOOCv+ODleODrOODvOODoOOBruWIl+WQjeOBrueiuuiqjQ0Kc3RyKERFRkFVTFQpDQpgYGANCg0KYGBge3J9DQojIOWIl+WQjeWkieabtA0KY29sbmFtZXMoREVGQVVMVCkgPC0gYygiZCIsInMiLCJiIiwiaSIpDQpjb2xuYW1lcyhERUZBVUxUKQ0KYGBgDQoNCmBgYHtyfQ0KIyDliJflkI3lpInmm7QgZHlsIOODkeODg+OCseODvOOCuOWIqeeUqA0KREVGQVVMVCA8LSByZW5hbWUoREVGQVVMVCwgI+WvvuixoeODh+ODvOOCv+ODleODrOODvOODoCwNCiAgICAgICAgICAgICAgICAgIGRlZmF1bHQgPSBkLCAjIFvmlrDliJflkI0gPSDml6fliJflkI1dIOS/ruato+W/heimgeOBquWIl+OBruOBvw0KICAgICAgICAgICAgICAgICAgc3R1ZGVudCA9IHMpDQpjb2xuYW1lcyhERUZBVUxUKQ0KYGBgDQoNCiMjIOWkieaVsOWei+OBrueiuuiqjeOBqOWkieaPmw0KDQrlpInmlbDjga7jgr/jgqTjg5fjga/vvJoNCjEuIOmHj+eahOWkieaVsCjmlbDph4/jgpLooajjgZkp77ya6YCj57aa5aSJ5pWwKOmVt+OBleOAgemHjeOBleOAgeaZ
gumWkynjgIHpm6LmlaPlpInmlbAo5Lu25pWw44CB5YCL5pWwKQ0KMi4g44Kr44OG44K044Oq44O85aSJ5pWwKOizqueahOWkieaVsCnvvJrmlbDph4/jgpLooajjgZnjgoLjga7jgafjgarjgYTvvIjlubTku6PjgIHmgKfliKXjgIHmiYDlsZ7vvIkNCg0K44KI44GP44GC44KL5aSJ5pWw44Gu5Z6L77yaDQoNCnwg44OH44O844K/5Z6LIHwg6KiY5Y+3ICAgICAgICAgICAgfCAgICAgICAgIOS9v+eUqOS+iyAgICAgICAgIHwNCnw6LS0tLS0tLS0tfDotLS0tLS0tLS0tLS0tLS0tfDotLS0tLS0tLS0tLS0tLS0tLS0tLS0tOnwNCnwg5pWw5YCk5Z6LICAgfCBudW1lcmljICAgICAgICAgfCAgICAgICAxLDEuMSwy44CAICAgICAgICB8DQp8IOaVtOaVsOWeiyAgIHwgaW50ZWdlcjxpbnQ+ICAgIHwgICAgICAgIDEsMiwz44CAICAgICAgICAgfA0KfCDlrp/mlbDlnosgICB8IGRvdWJsZTxkYmw+ICAgICB8ICAgICAgICAxLjM0NeOAgCAgICAgICAgIHwNCnwg5paH5a2X5YiX5Z6LIHwgY2hhcmFjdGVyPGNoYXI+IHwgIuWWtualremDqCIsIuS6uuS6i+mDqCIuLi7jgIAgfA0KfCDlm6DlrZDlnosgICB8IGZhY3RvcjxmY3Q+ICAgICB8ICAg5Za25qWt6YOoLOS6uuS6i+mDqC4uLuOAgCAgIHwNCnwg6KuW55CG5YCk5Z6LIHwgbG9naWNhbDxsZ2w+ICAgIHwgVFJVRSgxKSxGQUxTRSgwKS4uLuOAgCAgfA0KfCDml6Xku5jlnosgICB8IGRhdGU8ZGF0ZT4gICAgICB8ICAgIDIwMjAtMDEtMDEuLi7jgIAgICAgIHwNCg0KYGBge3J9DQojIEZhY3RvcuOAgW51beOBquOBqeOBjOODh+ODvOOCv+Wei+OBp+OBmQ0Kc3RyKERFRkFVTFQpDQpgYGANCg0KYGBge3J9DQojIGlzLnh4eCjlpInmlbAp77ya5aSJ5pWw5Z6L44Gu5Yik5patDQpudW0gPC0gYygxLDIsMykNCmlzLmZhY3RvcihudW0pDQpgYGANCg0KYGBge3J9DQojIGlzLnh4eCjlpInmlbAp77ya5aSJ5pWw5Z6L44Gu5Yik5patDQpzdHJpbmdfZGF0YSA8LSBjKCIxIiwiMiIsIjMiKQ0KaXMuY2hhcmFjdGVyKHN0cmluZ19kYXRhKQ0KYGBgDQoNCmBgYHtyfQ0Kc2V0d2QoIkM6L1VzZXJzL2p1bi5saS9PbmVEcml2ZSAtIFVpUGF0aC8wMC5TdHVkeS8wMC5EYXRhU2NpZW5zZS9SL1VkZW15IikNCiMgaWRjZmxE44GM5ZCE44OH44O844K/5Z6L44Gu6aCt5paH5a2XIGludGVnZXLihpJp44CBRGF0ZeKGkkQNCkhBS08gPC0gcmVhZF9jc3YoZmlsZSA9ICJ0ZXN0ZGF0YS5jc3YiLGNvbF90eXBlcyA9ICJpaWlpIiwNCiAgICAgICAgICAgICAgICAgbG9jYWxlID0gbG9jYWxlKGVuY29kaW5nID0gIlVURi04IikpDQpIQUtPDQpgYGANCg0KYGBge3J9DQojIOODh+ODvOOCv+ODleODrOODvOODoOWQjSTliJflkI0gPC0gYXMuWFhYKOODh+ODvOOCv+ODleODrOODvOODoOWQjSTliJflkI0pDQojIFhYWOOBjOODh+ODvOOCv+WeiyhudW1lcmljLGludGVnZXIsZG91Ymxl44Gq44GpKQ0KSEFLTyTnlarlj7c8LSBhcy5jaGFyYWN0ZXIoSEFLTyTnlarlj7cpDQpzdHIoSEFLTykNCmBgYA0KDQrjg4fjg7zjgr/jg5Xjg6zj
g7zjg6BcJOWIl+WQjeOBp+WIl+OCkuaKveWHuuOBp+OBjeOBvuOBmQ0KDQpgYGB7cn0NCmhlYWQoSEFLTyTnlarlj7csMTApDQpgYGANCg0KIyAyLjMg5ryU566XDQoNCiMjIOWfuuacrOa8lOeul+WtkA0KDQp8IOiomOWPtyAgIHwg5oSP5ZGzICAgICAgICAgICAgICAgfCAgICAgICAg5L2/55So5L6LICAgICAgICB8DQp8Oi0tLS0tLS18Oi0tLS0tLS0tLS0tLS0tLS0tLS18Oi0tLS0tLS0tLS0tLS0tLS0tLS0tOnwNCnwgXF4gICAgIHwg44G544GN5LmXICAgICAgICAgICAgIHwgIDJeMiwyXjUsMisxMFxeMiDjgIAgIHwNCnwgOiAgICAgIHwgKy0x44Gu562J5beu5pWw5YiXICAgICAgfCAgICAxOjEwLDEwOi0xMCDjgIAgICAgfA0KfCBcKiwvICAgfCDmjpvjgZHnrpfjgIHlibLjgornrpcgICAgIHwgICAgMVwqMiwzLTIvNiDjgIAgICAgIHwNCnwgKywtICAgIHwg6Laz44GX566X44CB5byV44GN566XICAgICB8ICAgICAxKzIsMy0xLzIg44CAICAgICB8DQp8IFw8LFw8PSB8IOWkp+Wwj+avlOi8gyAgICAgICAgICAgfCAgICAgICAwXDwxIOOAgCAgICAgICAgfA0KfCA9PSwhPSAgfCDnrYnjgZfjgYTjgIHnrYnjgZfjgY/jgarjgYQgfCAgIDE9PTEsImEiIT0iYiIg44CAICAgfA0KfCAmLCBcfCAgfCDjgYvjgaTjgIHjgb7jgZ/jga8gICAgICAgfCAxXD4wJjJcPjMsMVw+MFx8Mlw+MyB8DQoNCmBgYHtyfQ0KNTotNQ0KYGBgDQoNCmBgYHtyfQ0KMT09MQ0KYGBgDQoNCiMjIOihjOaVsOOAgeWIl+aVsA0KDQpgYGB7cn0NCiMg44OH44O844K/44OV44Os44O844Og44Gu44Gu6KGM5pWw44Go5YiX5pWw44Gu56K66KqNDQpkaW0oSEFLTykNCmxlbmd0aChIQUtPJOeVquWPtykNCmBgYA0KDQojIyDlpInmlbDjga7ku6PooajlgKQNCg0KfCDplqLmlbAgICAgICAgfCDmhI/lkbMgICAgICB8DQp8Oi0tLS0tLS0tLS0tLS0tLS0tLS0tfDotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLXwNCnwgbWVhbijjg4fjg7zjgr/jg5Xjg6zjg7zjg6BcJOWIl+WQjSkgfCDlubPlnYcgfA0KfCBtZWRpYW4o44OH44O844K/44OV44Os44O844OgXCTliJflkI0pIHwg5Lit5aSu5YCkIHwNCnwgbWluKOODh+ODvOOCv+ODleODrOODvOODoFwk5YiX5ZCNKSB8IOacgOWwj+WApCB8IA0KfCBtYXgo44OH44O844K/44OV44Os44O844OgXCTliJflkI0pICAgfCDmnIDlpKflgKQgfA0KfCBzdW0o44OH44O844K/44OV44Os44O844OgXCTliJflkI0pIHwg5ZCI6KiIIHwgDQp8IHRhYmxlKOODh+ODvOOCv+ODleODrOODvOODoFwk5YiX5ZCNKSAgIHwg6ZuG6KiIICAgICAgfA0KfCBzdW1tYXJ5KOODh+ODvOOCv+ODleODrOODvOODoCkgICAgfCDlkITlpInmlbDjga7opoHntIQgfA0KDQpgYGB7cn0NCiMg5Lul5YmN5L+u5q2j44GX44Gf5YiX5ZCN44KS5YWD44Gr5oi744GZDQogREVGQVVMVCA8LSByZW5hbWUoREVGQVVMVCwgI+WvvuixoeODh+ODvOOCv+ODleODrOODvOODoCwNCiAgICAgICAgICAgICAgICAgIGJhbGFuY2UgPSBi
LCAjIFvmlrDliJflkI0gPSDml6fliJflkI1dIOS/ruato+W/heimgeOBquWIl+OBruOBvw0KICAgICAgICAgICAgICAgICAgaW5jb21lID0gaSkNCg0KI+ani+mAoOeiuuiqjQ0KI3N0cihERUZBVUxUKQ0Kc3VtbWFyeShERUZBVUxUKQ0KYGBgDQoNCmBgYHtyfQ0KbWVkaWFuKERFRkFVTFQkYmFsYW5jZSkNCnRhYmxlKERFRkFVTFQkc3R1ZGVudCkNCmBgYA0KDQojIyDlpInmlbDjga7mlaPluIPluqYo44Gw44KJ44Gk44GN5bqm5ZCI44GEKQ0KDQrlpInmlbAo5YiXKeOBruaVo+W4g+W6puOBrueiuuiqje+8mg0KMS4gdmFyKOODh+ODvOOCv+ODleODrOODvOODoCTliJflkI0p77ya5LiN5YGP5YiG5pWjIA0KMi4gc2Qo44OH44O844K/44OV44Os44O844OgJOWIl+WQjSnvvJrmqJnmupblgY/lt64NCg0KYGBge3J9DQp2YXIoREVGQVVMVCRpbmNvbWUpICPkuI3lgY/liIbmlaMgDQpzZChERUZBVUxUJGluY29tZSkgI+aomea6luWBj+W3rg0KYGBgDQoNCmBgYHtyfQ0KIyDkuIroqJjjga7kuI3lgY/liIbmlaPjgpLmiYvli5XoqIjnrpcNCm4gPC0gbGVuZ3RoKERFRkFVTFQkaW5jb21lKQ0KbSA8LSBtZWFuKERFRkFVTFQkaW5jb21lKQ0KdmFyIDwtIHN1bSgoREVGQVVMVCRpbmNvbWUtbSleMikgLyAobi0xKQ0KdmFyDQpgYGANCg0KIyMg5aSJ5pWw44Gu5ryU566XDQoNCjEuIOWkieaVsOOBqOS4gOOBpOWApOOBrua8lOeul++8muWkieaVsOOBruWFqOOBpuOBruWApOOBq+S4gOOBpOOBruWApOOBjOa8lOeul+OBleOCjOOCiw0KMi4g5aSJ5pWw44Go5aSJ5pWw44Gu5ryU566X77yI5YCk44Gu5pWw44GM562J44GX44GE77yJ77ya6aCG55Wq6YCa44KK44Gr5ryU566X44GV44KM44KLDQozLiDlpInmlbDjgajlpInmlbDjga7mvJTnrpfvvIjlgKTjga7mlbDjgYznlbDjgarjgovloLTlkIjvvInvvJrpoIbnlarpgJrjgorjgavmvJTnrpfjgZXjgozjgIHlgKTjga7mlbDjgYzlsJHjgarjgYTlpInmlbDjga/nubDjgorov5TjgZXjgozjgabmvJTnrpfjgZXjgozjgosNCg0KYGBge3J9DQpjMSA8LSBjKDEsMiwzKQ0KYzIgPC0gYygyLDQpDQpjMyA8LSBjKDcpDQpjNCA8LSBjKDEwLDIwLDMwKQ0KDQpjMSArIGMzICNjYXNlMQ0KYzEgKiBjNCAjY2FzZTINCmMxICsgYzIgI2Nhc2UzDQpgYGANCg0KDQo=