readr package

#install.packages("tidyverse")
#install.packages("readr")
library('tidyverse')

## ─ Attaching packages ── tidyverse 1.2.1 ─

## ✔ ggplot2 3.0.0     ✔ purrr   0.2.5
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ readr   1.1.1     ✔ forcats 0.3.0

## ─ Conflicts ──── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

# Read data/2330.csv into `stock` with readr. The relative path is resolved
# against the working directory set on the next line.
setwd('~/lecture/riii')
# col_names = TRUE takes the column names from the first row of the file.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
stock <- read_csv('data/2330.csv', col_names = TRUE)

## Parsed with column specification:
## cols(
##   Date = col_date(format = ""),
##   Open = col_double(),
##   High = col_double(),
##   Low = col_double(),
##   Close = col_double(),
##   Volume = col_integer()
## )

read excel file

library(readxl)
setwd('~/lecture/riii/')
FinancialReport <- read_excel("./data/FinancialReport.xlsx")
#View(FinancialReport)
summary(FinancialReport)

##       年度           股本         財報評分          收盤       
##  Min.   :1999   Min.   : 767   Min.   :59.00   Min.   : 42.60  
##  1st Qu.:2003   1st Qu.:2027   1st Qu.:89.00   1st Qu.: 62.50  
##  Median :2007   Median :2583   Median :92.00   Median : 71.00  
##  Mean   :2007   Mean   :2249   Mean   :88.24   Mean   : 83.75  
##  3rd Qu.:2011   3rd Qu.:2592   3rd Qu.:94.00   3rd Qu.: 97.00  
##  Max.   :2015   Max.   :2643   Max.   :96.00   Max.   :167.00  
##       平均             漲跌            漲跌__1          營業收入   
##  Min.   : 52.40   Min.   :-88.500   Min.   :-53.00   Min.   : 731  
##  1st Qu.: 56.40   1st Qu.: -5.500   1st Qu.: -8.10   1st Qu.:2030  
##  Median : 67.40   Median :  6.500   Median :  8.80   Median :3174  
##  Mean   : 82.29   Mean   :  4.235   Mean   : 11.77   Mean   :3576  
##  3rd Qu.:104.00   3rd Qu.: 20.100   3rd Qu.: 28.00   3rd Qu.:4271  
##  Max.   :147.00   Max.   : 96.000   Max.   :135.00   Max.   :8435  
##     營業毛利       營業利益       業外損益         稅後淨利   
##  Min.   : 315   Min.   : 128   Min.   :-43.70   Min.   : 145  
##  1st Qu.: 765   1st Qu.: 613   1st Qu.:  4.97   1st Qu.: 651  
##  Median :1417   Median :1044   Median : 35.00   Median : 999  
##  Mean   :1639   Mean   :1238   Mean   : 50.67   Mean   :1179  
##  3rd Qu.:2071   3rd Qu.:1592   3rd Qu.: 62.10   3rd Qu.:1616  
##  Max.   :4104   Max.   :3200   Max.   :304.00   Max.   :3066  
##       ROA             EPS        
##  Min.   : 3.93   Min.   : 0.830  
##  1st Qu.:15.50   1st Qu.: 3.450  
##  Median :18.40   Median : 4.140  
##  Mean   :17.15   Mean   : 4.969  
##  3rd Qu.:19.40   3rd Qu.: 6.240  
##  Max.   :24.70   Max.   :11.820

read json

setwd('~/lecture/riii/')
download.file('https://www.railway.gov.tw/Upload/UserFiles/%E8%87%BA%E9%90%B5%E5%B1%80%E6%88%BF%E5%9C%B0%E7%94%A2%E5%87%BA%E7%A7%9F%E6%83%85%E5%BD%A2.json', destfile = './data/rent.json')

library(jsonlite)

## 
## Attaching package: 'jsonlite'

## The following object is masked from 'package:purrr':
## 
##     flatten

json_data<- fromJSON('./data/rent.json')

## Warning: JSON string contains (illegal) UTF8 byte-order-mark!

head(json_data)

##     縣市 經管單位                 用途限制 實際用途 每月租金
## 1 臺北市   台北所 辦公或住宅或法律許可範圍     商店    22900
## 2 臺北市   台北所 辦公或住宅或法律許可範圍   辦公室   138000
## 3 臺北市   台北所 辦公或住宅或法律許可範圍     商店    56899
## 4 臺北市   台北所 辦公或住宅或法律許可範圍     住家    23050
## 5 臺北市   台北所 辦公或住宅或法律許可範圍     商店    33294
## 6 臺北市   台北所 辦公或住宅或法律許可範圍     商店    20020
##              租期屆滿 建物面積     構造 總樓層數 建物現況
## 1  105.12.1-108.11.30    189.8 木石磚造        1        B
## 2 104.06.07-107.06.06   220.49     磚造        1        B
## 3     105.2.3-108.2.2      132 加強磚造        2        A
## 4 104.05.28-107.05.27  102.545 加強磚造        3        C
## 5   105.3.31-107.3.30   127.92 加強磚造        1        A
## 6     105.7.2-107.7.1       72 加強磚造        2        B
##                                                                  房屋座落
## 1                     新北市淡水區鼻頭街10、11、12號(淡水鎮海鷗段316地號)
## 2 臺北市大同區忠孝西路2段13號(大同區玉泉段2小段371-7、371-20、371-21地號)
## 3                    臺北市大同區赤峰街33巷4號(大同區圓環段二小段151地號)
## 4                   臺北市萬華區康定路56巷3弄3號(萬華區直興段1小段93地號)
## 5                         臺北市中正區臨沂街19巷16號(中正區臨沂段301地號)
## 6         臺北市中正區汀州路2段73號(臺北市中正區河堤段687-16、687-27地號)
##                                   土地使用分區
## 1                                       工業區
## 2 371-7,371-20為交通廣場用地, 371-21為道路用地
## 3                                 第四種住宅區
## 4                                 第四種商業區
## 5                                 第三種住宅區
## 6               第三種住宅區及第三之一種住宅區

read xml

library(XML)
#url <- 'http://opendata.epa.gov.tw/ws/Data/ATM00698/?$format=xml'
#weather <- xmlToDataFrame(url)
#View(weather)
#weather[ weather$SiteName == '臺北',  'Temperature'   ]

Flow Control

x=5;
if(x>3){
  print("x > 3")
}else{
  print("x <= 3")
}

## [1] "x > 3"

if(x>3) print("x > 3") else print("x <= 3")

## [1] "x > 3"

test = ifelse(x>3,"x > 3","x <= 3")
test

## [1] "x > 3"

# Three-way comparison of x against 3 with an if / else if / else chain.
x <- 5
if (x > 3) {
  print("x > 3")
} else if (x == 3) {
  print("x == 3")
} else {
  # Only reached when x is below 3 (the equal case is handled above),
  # so the message says "<" rather than "<=".
  print("x < 3")
}

## [1] "x > 3"

switch(2,print("aaa"),print("bbb"),print("ccc"))

## [1] "bbb"

switch("third",first=print("aaa"),second=print("bbb"),third=print("ccc"))

## [1] "ccc"

for(i in 1:10){
  print(i);
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

# Accumulate 1 + 2 + ... + 100 with a for loop.
# The accumulator is called `total` so that it does not mask base::sum().
total <- 0
for (i in 1:100) {
  total <- total + i
}
total

## [1] 5050

sum(1:100)

## [1] 5050

# Same total computed with a while loop: add cnt for cnt = 0, 1, ..., 100.
# The accumulator is called `total` so that it does not mask base::sum().
total <- 0
cnt <- 0
while (cnt <= 100) {
  total <- total + cnt
  cnt <- cnt + 1
}
total

## [1] 5050

mat = matrix(1:9, byrow=TRUE, nrow=3)
for(i in 1:nrow(mat)){
  for(j in 1:ncol(mat)){
    print(mat[i,j])
  }
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9

for(i in seq_len(nrow(mat))) {
  for(j in seq_len(ncol(mat))) { 
    print(mat[i, j])
  }
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
## [1] 6
## [1] 7
## [1] 8
## [1] 9

Review seq()

mat = matrix(1:9, byrow=TRUE, nrow=3)
1:3

## [1] 1 2 3

1:nrow(mat)

## [1] 1 2 3

seq(from=1,to=nrow(mat),by=1)

## [1] 1 2 3

seq_len(nrow(mat))

## [1] 1 2 3

break / next

#break(跳出迴圈)
for(n in 1:10){
  if(n==5){
    break
  }
  print(n)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4

#next(略過本次,類似python continue)
for(n in 1:10){
  if(n==5){
    next
  }
  print(n)
}

## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 6
## [1] 7
## [1] 8
## [1] 9
## [1] 10

p86 example

# method1: fill a 9 x 9 multiplication table of formatted strings with
# nested loops.
# The cells hold text, so the matrix is preallocated as character; starting
# from a numeric matrix forces the whole matrix to be coerced on first write.
mat <- matrix("", nrow = 9, ncol = 9)
# seq_len() is used instead of 1:n, which would iterate over c(1, 0) if a
# dimension were 0.
for (i in seq_len(nrow(mat))) {
  for (j in seq_len(ncol(mat))) {
    # Alternative cell contents tried in the lecture:
    #   mat[i, j] <- i * j                           (needs a numeric matrix)
    #   mat[i, j] <- paste(i, "*", j, "= ", i * j)
    mat[i, j] <- sprintf(" %s * %s = %s", i, j, i * j)
  }
}

#method2: the same multiplication table as a single matrix product
# mat1 is a 9 x 1 column holding 1..9
mat1 = matrix(1:9, nrow = 9);
# mat2 is a 1 x 9 row holding 1..9
mat2 = matrix(1:9, nrow = 1);
# (9 x 1) %*% (1 x 9) gives the 9 x 9 matrix whose [i, j] cell is i * j
mat = mat1 %*% mat2;
mat

##       [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
##  [1,]    1    2    3    4    5    6    7    8    9
##  [2,]    2    4    6    8   10   12   14   16   18
##  [3,]    3    6    9   12   15   18   21   24   27
##  [4,]    4    8   12   16   20   24   28   32   36
##  [5,]    5   10   15   20   25   30   35   40   45
##  [6,]    6   12   18   24   30   36   42   48   54
##  [7,]    7   14   21   28   35   42   49   56   63
##  [8,]    8   16   24   32   40   48   56   64   72
##  [9,]    9   18   27   36   45   54   63   72   81

Function

# f: add 3 to the argument and return the result.
f <- function(a) {
  a + 3
}
f(3)

## [1] 6

# Default argument value: calling f1() with no argument uses a = 3.
f1 <- function(a = 3) {
  shifted <- a + 3
  shifted
}
f1()

## [1] 6

# Lazy evaluation: `a` and `c` are never used in the body, so f2() runs
# even though `a` has no default and is not supplied by the caller.
f2 <- function(a, b = 2, c = NULL) {
  b + 1
}
f2()

## [1] 3

# Local scope: the `b` assigned inside f3 exists only within the function
# call, so the global `b` defined first keeps its value.
b <- 3
f3 <- function(a) {
  b <- 2
  b
}
f3()

## [1] 2

# f4: return a + b, except that an `a` greater than 3 is replaced by 100
# before the addition.
f4 <- function(a, b) {
  base <- if (a > 3) 100 else a
  base + b
}
f4(4,1)

## [1] 101

p90 example

# Build a 5 x 5 matrix from a "|"-separated file.
#
# filename: path to a headerless file with three "|"-separated fields per
#           line: a row label, a column label (each one of "A".."E") and
#           the value to store in that cell.
# Returns a numeric matrix with row and column names "A".."E". Cells that
# do not appear in the file keep the placeholder value -1.
match_func <- function(filename) {
  # Named `records` rather than `match` so base::match() is not masked.
  records <- read.table(filename, sep = "|")
  ids <- c("A", "B", "C", "D", "E")
  mat <- matrix(-1, nrow = length(ids), ncol = length(ids),
                dimnames = list(ids, ids))
  # seq_len() yields an empty sequence for zero rows, whereas 1:nrow()
  # would iterate over c(1, 0).
  for (i in seq_len(nrow(records))) {
    # Index by label text. If the columns are factors, indexing with them
    # directly would use their integer codes instead of the labels.
    row_id <- as.character(records[i, 1])
    col_id <- as.character(records[i, 2])
    mat[row_id, col_id] <- records[i, 3]
  }
  mat
}

match_func('~/lecture/riii/data/match.txt')

##    A  B  C  D  E
## A -1  1  3  2  0
## B  2 -1  1  3  0
## C  2  0 -1  1  5
## D  1  1  2 -1  0
## E  1  1  2  3 -1

# General case: the labels are discovered from the file instead of being
# hard-coded.
#
# filename: path to a delimited text file whose first two columns hold the
#           row and column labels and whose third column holds the value.
# header:   passed to read.table(); TRUE when the first line holds column
#           names.
# sep:      field separator passed to read.table().
# Returns a square numeric matrix whose row and column names are the sorted
# labels found in the first two columns. Cells with no record keep the
# placeholder value -1.
match_func <- function(filename, header = TRUE, sep = "|") {
  # Named `records` rather than `t` so base::t() is not masked.
  records <- read.table(filename, header = header, sep = sep)
  row_keys <- as.character(records[[1]])
  col_keys <- as.character(records[[2]])
  # Derive the labels from the values themselves. levels() is NULL for the
  # character columns read.table() returns when stringsAsFactors is FALSE
  # (the default since R 4.0.0), which produced a 0 x 0 matrix and a
  # subscript error on the first assignment.
  ids <- sort(unique(c(row_keys, col_keys)))
  mat <- matrix(-1, nrow = length(ids), ncol = length(ids),
                dimnames = list(ids, ids))
  # seq_len() yields an empty sequence when the file has no data rows.
  for (i in seq_len(nrow(records))) {
    mat[row_keys[i], col_keys[i]] <- records[i, 3]
  }
  mat
}

match_func('~/lecture/riii/data/match.txt',F)

##    A  B  C  D  E
## A -1  1  3  2  0
## B  2 -1  1  3  0
## C  2  0 -1  1  5
## D  1  1  2 -1  0
## E  1  1  2  3 -1

lapply sapply apply tapply

x = list(c(1,2,3,4), c(5,6,7,8))
lapply(x, sum)

## [[1]]
## [1] 10
## 
## [[2]]
## [1] 26

m1 = matrix(1:4, byrow=TRUE, nrow=2)
m2 = matrix(5:8, byrow=TRUE, nrow=2)
li = list(m1, m2)
lapply(li, mean)

## [[1]]
## [1] 2.5
## 
## [[2]]
## [1] 6.5

grades =list(kevin = c(80,60,92), marry = c(56,75,64,84,56), QOO = c(10,20,3,4,10))
lapply(grades, sum)

## $kevin
## [1] 232
## 
## $marry
## [1] 335
## 
## $QOO
## [1] 47

lapply(grades, mean)

## $kevin
## [1] 77.33333
## 
## $marry
## [1] 67
## 
## $QOO
## [1] 9.4

lapply(grades, function(e){list(sum = sum(e), min = min(e))})

## $kevin
## $kevin$sum
## [1] 232
## 
## $kevin$min
## [1] 60
## 
## 
## $marry
## $marry$sum
## [1] 335
## 
## $marry$min
## [1] 56
## 
## 
## $QOO
## $QOO$sum
## [1] 47
## 
## $QOO$min
## [1] 3

class(lapply(grades, sum))

## [1] "list"

sapply(grades, sum)

## kevin marry   QOO 
##   232   335    47

class(sapply(grades, sum))

## [1] "numeric"

sapply(li, mean)

## [1] 2.5 6.5

sapply(li,function(e) e[1,])

##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6

m = matrix(1:4, byrow=TRUE, nrow=2)
apply(m, 1, sum) # rowsums

## [1] 3 7

apply(m, 2, sum) # colsums

## [1] 4 6

rowmeans = apply(m, 1, mean)
colmeans = apply(m, 2, mean)

x = c(80,70,59,88,72,57)
t = c(1,1,2,1,1,2)
tapply(x,t, mean)

##    1    2 
## 77.5 58.0

data(iris)

tapply(iris$Sepal.Length, iris$Species, mean)

##     setosa versicolor  virginica 
##      5.006      5.936      6.588

lapply(names(iris[1:4]),function(e){tapply(iris[,e],iris$Species,mean) })

## [[1]]
##     setosa versicolor  virginica 
##      5.006      5.936      6.588 
## 
## [[2]]
##     setosa versicolor  virginica 
##      3.428      2.770      2.974 
## 
## [[3]]
##     setosa versicolor  virginica 
##      1.462      4.260      5.552 
## 
## [[4]]
##     setosa versicolor  virginica 
##      0.246      1.326      2.026

探索性資料分析

表格

#download file:
#download.file("https://github.com/YuHsuanLin/riii/raw/master/Statistics/cdc.Rdata","~/lecture/riii/Statistics/cdc.Rdata")

#import data
#getwd()
setwd("~/lecture/riii")
load("Statistics/cdc.Rdata")

str(cdc)

## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : num  0 0 1 1 0 1 1 0 0 1 ...
##  $ hlthplan: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ smoke100: num  0 1 1 0 0 0 0 0 1 0 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...

head(cdc)

##     genhlth exerany hlthplan smoke100 height weight wtdesire age gender
## 1      good       0        1        0     70    175      175  77      m
## 2      good       0        1        1     64    125      115  33      f
## 3      good       1        1        1     60    105      105  49      f
## 4      good       1        1        0     66    132      124  42      f
## 5 very good       0        1        0     61    150      130  55      f
## 6 very good       1        1        0     64    114      114  55      f

names(cdc)

## [1] "genhlth"  "exerany"  "hlthplan" "smoke100" "height"   "weight"  
## [7] "wtdesire" "age"      "gender"

#轉換資料類型
cdc$exerany = as.factor(cdc$exerany)
cdc$hlthplan = as.factor(cdc$hlthplan)
cdc$smoke100 = as.factor(cdc$smoke100)

str(cdc)

## 'data.frame':    20000 obs. of  9 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 2 1 1 2 ...
##  $ hlthplan: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ smoke100: Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 1 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...

#一維次數分配表
table(cdc$exerany)

## 
##     0     1 
##  5086 14914

#相對比例
table(cdc$exerany) / length(cdc$exerany)

## 
##      0      1 
## 0.2543 0.7457

#二維次數分配表
table(cdc$gender,cdc$exerany)

##    
##        0    1
##   m 2149 7420
##   f 2937 7494

#連續型資料作表
table(cdc$height)

## 
##   48   49   50   51   52   53   54   55   56   57   58   59   60   61   62 
##    2    1    1    2    2    7    3    4   17   20   51  170  613  594 1272 
##   63   64   65   66   67   68   69   70   71   72   73   74   75   76   77 
## 1368 1662 1568 1843 1671 1505 1380 1500 1296 1393  784  605  321  189   80 
##   78   79   80   81   82   83   84   93 
##   43   15   10    3    2    1    1    1

summary(cdc$height)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   48.00   64.00   67.00   67.18   70.00   93.00

head(cut(cdc$height,seq(45,95,by=5)))

## [1] (65,70] (60,65] (55,60] (65,70] (60,65] (60,65]
## 10 Levels: (45,50] (50,55] (55,60] (60,65] (65,70] (70,75] ... (90,95]

# Right-closed intervals: each bin includes its upper bound and excludes
# its lower bound, e.g. (45,50]. TRUE is spelled out because `T` is an
# ordinary variable that can be reassigned.
table(cut(cdc$height, seq(45, 95, by = 5), right = TRUE))

## 
## (45,50] (50,55] (55,60] (60,65] (65,70] (70,75] (75,80] (80,85] (85,90] 
##       4      18     871    6464    7899    4399     337       7       0 
## (90,95] 
##       1

# Left-closed intervals: each bin includes its lower bound and excludes
# its upper bound, e.g. [45,50). FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
table(cut(cdc$height, seq(45, 95, by = 5), right = FALSE))

## 
## [45,50) [50,55) [55,60) [60,65) [65,70) [70,75) [75,80) [80,85) [85,90) 
##       3      15     262    5509    7967    5578     648      17       0 
## [90,95) 
##       1

## Add labels (group names): number the bins 1, 2, ... instead of showing
## the interval text.
# The break points are computed once and reused for the label count; there
# is one fewer bin than there are break points.
height_breaks <- seq(45, 95, by = 5)
table(cut(cdc$height, height_breaks, right = FALSE,
          labels = seq_len(length(height_breaks) - 1)))

## 
##    1    2    3    4    5    6    7    8    9   10 
##    3   15  262 5509 7967 5578  648   17    0    1

## Store the binned heights in a new h_group column of cdc.
# Left-closed 5-unit bins from 45 to 95; FALSE is spelled out because `F`
# is an ordinary variable that can be reassigned.
cdc$h_group <- cut(cdc$height, seq(45, 95, by = 5), right = FALSE)
str(cdc)

## 'data.frame':    20000 obs. of  10 variables:
##  $ genhlth : Factor w/ 5 levels "excellent","very good",..: 3 3 3 3 2 2 2 2 3 3 ...
##  $ exerany : Factor w/ 2 levels "0","1": 1 1 2 2 1 2 2 1 1 2 ...
##  $ hlthplan: Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ smoke100: Factor w/ 2 levels "0","1": 1 2 2 1 1 1 1 1 2 1 ...
##  $ height  : num  70 64 60 66 61 64 71 67 65 70 ...
##  $ weight  : int  175 125 105 132 150 114 194 170 150 180 ...
##  $ wtdesire: int  175 115 105 124 130 114 185 160 130 170 ...
##  $ age     : int  77 33 49 42 55 55 31 45 27 44 ...
##  $ gender  : Factor w/ 2 levels "m","f": 1 2 2 2 2 2 1 1 2 1 ...
##  $ h_group : Factor w/ 10 levels "[45,50)","[50,55)",..: 6 4 4 5 4 4 6 5 5 6 ...

apply example

apply(table(cdc$exerany,cdc$genhlth),1,function(e){ e/sum(e) })

##            
##                      0          1
##   excellent 0.14982304 0.26116401
##   very good 0.26582776 0.37682714
##   good      0.34034605 0.26444951
##   fair      0.16850177 0.07791337
##   poor      0.07550138 0.01964597

統計量

# Measures of central tendency: mean, median, mode

a <- c(100, 120, 130, 110, 100, 90, 80, 90, 100, 110)
# Mean by hand: divide by the number of observations instead of a
# hard-coded 10, so the expression stays correct if `a` changes length.
sum(a) / length(a)

## [1] 103

mean(a)

## [1] 103

#有極端值
b = c(a, 10000)
mean(b)

## [1] 1002.727

##  [1] 100 120 130 110 100  90  80  90 100 110

sort(a)

##  [1]  80  90  90 100 100 100 110 110 120 130

median(a)

## [1] 100

sort(b)

##  [1]    80    90    90   100   100   100   110   110   120   130 10000

median(b)

## [1] 100

table(c(1,4,4,3))

## 
## 1 3 4 
## 1 1 2

which.max(table(c(1,4,4,3)))

## 4 
## 3

names(which.max(table(c(1,4,4,3))))

## [1] "4"

mean(cdc$weight)

## [1] 169.683

median(cdc$weight)

## [1] 165

as.integer(names(which.max(table(cdc$weight))))

## [1] 160

#離差量數:range IQR variance standard deviation

a = c(173,162,150,160,155,168,171,185,175,178,182)
sort(a)

##  [1] 150 155 160 162 168 171 173 175 178 182 185

range(a)

## [1] 150 185

quantile(a,0.5)

## 50% 
## 171

quantile(a,0.25)

## 25% 
## 161

quantile(a,0.75)

##   75% 
## 176.5

quantile(a,0.75) - quantile(a,0.25)

##  75% 
## 15.5

IQR(a)

## [1] 15.5

fivenum(a)

## [1] 150.0 161.0 171.0 176.5 185.0

summary(a)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   150.0   161.0   171.0   169.0   176.5   185.0

boxplot(a,horizontal = T)

b = c(a,226)
range(b)

## [1] 150 226

IQR(b)

## [1] 17.5

boxplot(b,horizontal = T)

#全距
range(cdc$weight)

## [1]  68 500

#四分位距
IQR(cdc$weight)

## [1] 50

#變異數
var(cdc$weight)

## [1] 1606.484

#標準差
sqrt(var(cdc$weight))

## [1] 40.08097

sd(cdc$weight)

## [1] 40.08097

#摘要數據
summary(cdc$weight)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    68.0   140.0   165.0   169.7   190.0   500.0

Covariance & Correlation

x <- c(160, 170, 180)
y <- c(64, 68, 72)

# Sample covariance by hand: sum of the products of deviations from the
# means, divided by n - 1. The divisor comes from length(x) instead of a
# hard-coded 2, so the formula stays correct if the vectors change length.
cov_xy <- sum((x - mean(x)) * (y - mean(y))) / (length(x) - 1)
cov_xy

## [1] 40

cov(x,y)

## [1] 40

#計算相關係數
cor_xy = cov(x,y) / (sd(x) * sd(y))  
cor_xy

## [1] 1

cor(x,y)

## [1] 1

plot(x,y)

#example1:
data(mtcars)
mtcars

##                      mpg cyl  disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4           21.0   6 160.0 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag       21.0   6 160.0 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710          22.8   4 108.0  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive      21.4   6 258.0 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout   18.7   8 360.0 175 3.15 3.440 17.02  0  0    3    2
## Valiant             18.1   6 225.0 105 2.76 3.460 20.22  1  0    3    1
## Duster 360          14.3   8 360.0 245 3.21 3.570 15.84  0  0    3    4
## Merc 240D           24.4   4 146.7  62 3.69 3.190 20.00  1  0    4    2
## Merc 230            22.8   4 140.8  95 3.92 3.150 22.90  1  0    4    2
## Merc 280            19.2   6 167.6 123 3.92 3.440 18.30  1  0    4    4
## Merc 280C           17.8   6 167.6 123 3.92 3.440 18.90  1  0    4    4
## Merc 450SE          16.4   8 275.8 180 3.07 4.070 17.40  0  0    3    3
## Merc 450SL          17.3   8 275.8 180 3.07 3.730 17.60  0  0    3    3
## Merc 450SLC         15.2   8 275.8 180 3.07 3.780 18.00  0  0    3    3
## Cadillac Fleetwood  10.4   8 472.0 205 2.93 5.250 17.98  0  0    3    4
## Lincoln Continental 10.4   8 460.0 215 3.00 5.424 17.82  0  0    3    4
## Chrysler Imperial   14.7   8 440.0 230 3.23 5.345 17.42  0  0    3    4
## Fiat 128            32.4   4  78.7  66 4.08 2.200 19.47  1  1    4    1
## Honda Civic         30.4   4  75.7  52 4.93 1.615 18.52  1  1    4    2
## Toyota Corolla      33.9   4  71.1  65 4.22 1.835 19.90  1  1    4    1
## Toyota Corona       21.5   4 120.1  97 3.70 2.465 20.01  1  0    3    1
## Dodge Challenger    15.5   8 318.0 150 2.76 3.520 16.87  0  0    3    2
## AMC Javelin         15.2   8 304.0 150 3.15 3.435 17.30  0  0    3    2
## Camaro Z28          13.3   8 350.0 245 3.73 3.840 15.41  0  0    3    4
## Pontiac Firebird    19.2   8 400.0 175 3.08 3.845 17.05  0  0    3    2
## Fiat X1-9           27.3   4  79.0  66 4.08 1.935 18.90  1  1    4    1
## Porsche 914-2       26.0   4 120.3  91 4.43 2.140 16.70  0  1    5    2
## Lotus Europa        30.4   4  95.1 113 3.77 1.513 16.90  1  1    5    2
## Ford Pantera L      15.8   8 351.0 264 4.22 3.170 14.50  0  1    5    4
## Ferrari Dino        19.7   6 145.0 175 3.62 2.770 15.50  0  1    5    6
## Maserati Bora       15.0   8 301.0 335 3.54 3.570 14.60  0  1    5    8
## Volvo 142E          21.4   4 121.0 109 4.11 2.780 18.60  1  1    4    2

cov(mtcars)

##              mpg         cyl        disp          hp         drat
## mpg    36.324103  -9.1723790  -633.09721 -320.732056   2.19506351
## cyl    -9.172379   3.1895161   199.66028  101.931452  -0.66836694
## disp -633.097208 199.6602823 15360.79983 6721.158669 -47.06401915
## hp   -320.732056 101.9314516  6721.15867 4700.866935 -16.45110887
## drat    2.195064  -0.6683669   -47.06402  -16.451109   0.28588135
## wt     -5.116685   1.3673710   107.68420   44.192661  -0.37272073
## qsec    4.509149  -1.8868548   -96.05168  -86.770081   0.08714073
## vs      2.017137  -0.7298387   -44.37762  -24.987903   0.11864919
## am      1.803931  -0.4657258   -36.56401   -8.320565   0.19015121
## gear    2.135685  -0.6491935   -50.80262   -6.358871   0.27598790
## carb   -5.363105   1.5201613    79.06875   83.036290  -0.07840726
##               wt         qsec           vs           am        gear
## mpg   -5.1166847   4.50914919   2.01713710   1.80393145   2.1356855
## cyl    1.3673710  -1.88685484  -0.72983871  -0.46572581  -0.6491935
## disp 107.6842040 -96.05168145 -44.37762097 -36.56401210 -50.8026210
## hp    44.1926613 -86.77008065 -24.98790323  -8.32056452  -6.3588710
## drat  -0.3727207   0.08714073   0.11864919   0.19015121   0.2759879
## wt     0.9573790  -0.30548161  -0.27366129  -0.33810484  -0.4210806
## qsec  -0.3054816   3.19316613   0.67056452  -0.20495968  -0.2804032
## vs    -0.2736613   0.67056452   0.25403226   0.04233871   0.0766129
## am    -0.3381048  -0.20495968   0.04233871   0.24899194   0.2923387
## gear  -0.4210806  -0.28040323   0.07661290   0.29233871   0.5443548
## carb   0.6757903  -1.89411290  -0.46370968   0.04637097   0.3266129
##             carb
## mpg  -5.36310484
## cyl   1.52016129
## disp 79.06875000
## hp   83.03629032
## drat -0.07840726
## wt    0.67579032
## qsec -1.89411290
## vs   -0.46370968
## am    0.04637097
## gear  0.32661290
## carb  2.60887097

cor(mtcars)

##             mpg        cyl       disp         hp        drat         wt
## mpg   1.0000000 -0.8521620 -0.8475514 -0.7761684  0.68117191 -0.8676594
## cyl  -0.8521620  1.0000000  0.9020329  0.8324475 -0.69993811  0.7824958
## disp -0.8475514  0.9020329  1.0000000  0.7909486 -0.71021393  0.8879799
## hp   -0.7761684  0.8324475  0.7909486  1.0000000 -0.44875912  0.6587479
## drat  0.6811719 -0.6999381 -0.7102139 -0.4487591  1.00000000 -0.7124406
## wt   -0.8676594  0.7824958  0.8879799  0.6587479 -0.71244065  1.0000000
## qsec  0.4186840 -0.5912421 -0.4336979 -0.7082234  0.09120476 -0.1747159
## vs    0.6640389 -0.8108118 -0.7104159 -0.7230967  0.44027846 -0.5549157
## am    0.5998324 -0.5226070 -0.5912270 -0.2432043  0.71271113 -0.6924953
## gear  0.4802848 -0.4926866 -0.5555692 -0.1257043  0.69961013 -0.5832870
## carb -0.5509251  0.5269883  0.3949769  0.7498125 -0.09078980  0.4276059
##             qsec         vs          am       gear        carb
## mpg   0.41868403  0.6640389  0.59983243  0.4802848 -0.55092507
## cyl  -0.59124207 -0.8108118 -0.52260705 -0.4926866  0.52698829
## disp -0.43369788 -0.7104159 -0.59122704 -0.5555692  0.39497686
## hp   -0.70822339 -0.7230967 -0.24320426 -0.1257043  0.74981247
## drat  0.09120476  0.4402785  0.71271113  0.6996101 -0.09078980
## wt   -0.17471588 -0.5549157 -0.69249526 -0.5832870  0.42760594
## qsec  1.00000000  0.7445354 -0.22986086 -0.2126822 -0.65624923
## vs    0.74453544  1.0000000  0.16834512  0.2060233 -0.56960714
## am   -0.22986086  0.1683451  1.00000000  0.7940588  0.05753435
## gear -0.21268223  0.2060233  0.79405876  1.0000000  0.27407284
## carb -0.65624923 -0.5696071  0.05753435  0.2740728  1.00000000

cov(mtcars[1:3])

##              mpg        cyl       disp
## mpg    36.324103  -9.172379  -633.0972
## cyl    -9.172379   3.189516   199.6603
## disp -633.097208 199.660282 15360.7998

#example2: correlation between the Export and GDP columns of data/gdp.csv
setwd('~/lecture/riii')
gdp <- read.csv("data/gdp.csv", header = TRUE)
#gdp <- gdp[1:15, ]
# Keep only the rows that have no missing values.
gdp <- gdp[complete.cases(gdp), ]
# Strip "," from the GDP and Export text before converting to numeric
# (presumably thousands separators; confirm against the file).
# gsub() removes every comma. sub() removes only the first one, so a value
# such as "1,234,567" would stay non-numeric and be converted to NA.
gdp$GDP <- as.numeric(gsub(",", "", gdp$GDP, fixed = TRUE))
gdp$Export <- as.numeric(gsub(",", "", gdp$Export, fixed = TRUE))
cor(gdp$Export, gdp$GDP)

## [1] 0.982525

統計圖

#屬質資料: 長條圖、圓餅圖
barplot(table(cdc$smoke100))

?barplot
barplot(table(cdc$smoke100),xlab='有無吸菸',ylab='人數',main='title',col='blue',family="Songti SC")

pie(table(cdc$smoke100))

pie(table(cdc$genhlth))

pie(table(cdc$genhlth),col = rainbow(5))

#加上各分類比例
pct = round(table(cdc$genhlth) / length(cdc$genhlth) *100,1)
labels = paste(names(pct),pct,"%")
pie(table(cdc$genhlth), labels = labels)

gender_smokers = table(cdc$gender,cdc$smoke100)

mosaicplot(gender_smokers)

#屬量資料: 直方圖、盒鬚圖、莖葉圖
hist(cdc$age)

par(mfrow=c(3,1))
hist(cdc$height)
hist(cdc$height,breaks = 30)
hist(cdc$height,breaks = 50)

stem(cdc$age)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 88888888888888888888888888888888888888888888888888888888888888888888+509
##   2 | 00000000000000000000000000000000000000000000000000000000000000000000+1694
##   2 | 55555555555555555555555555555555555555555555555555555555555555555555+1835
##   3 | 00000000000000000000000000000000000000000000000000000000000000000000+1954
##   3 | 55555555555555555555555555555555555555555555555555555555555555555555+2154
##   4 | 00000000000000000000000000000000000000000000000000000000000000000000+2109
##   4 | 55555555555555555555555555555555555555555555555555555555555555555555+1842
##   5 | 00000000000000000000000000000000000000000000000000000000000000000000+1578
##   5 | 55555555555555555555555555555555555555555555555555555555555555555555+1224
##   6 | 00000000000000000000000000000000000000000000000000000000000000000000+969
##   6 | 55555555555555555555555555555555555555555555555555555555555555555555+975
##   7 | 00000000000000000000000000000000000000000000000000000000000000000000+889
##   7 | 55555555555555555555555555555555555555555555555555555555555555555555+614
##   8 | 00000000000000000000000000000000000000000000000000000000000000000000+344
##   8 | 55555555555555555555555555555555555555555555555555555555566666666666+69
##   9 | 00000000011111111112222223333333444
##   9 | 556799

# Draw 100 ages at random and show them as a stem-and-leaf plot.
# The stored sample is the one plotted, so `tmp` holds exactly the values
# displayed; previously `tmp` was unused and a second sample was drawn.
tmp <- sample(cdc$age, 100)
stem(tmp)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 88889
##   2 | 122234455677777788899
##   3 | 011222333444577999
##   4 | 00001111233555667889
##   5 | 00011224566789
##   6 | 001135577799
##   7 | 24568
##   8 | 1125
##   9 | 4

?stem
stem(sample(cdc$age,100),scale=2)

## 
##   The decimal point is 1 digit(s) to the right of the |
## 
##   1 | 8889
##   2 | 112333444
##   2 | 555577889
##   3 | 00011233444
##   3 | 5678888899
##   4 | 00001122223334444
##   4 | 55666678889
##   5 | 01133334
##   5 | 5556777799
##   6 | 2
##   6 | 588
##   7 | 02444
##   7 | 
##   8 | 03

par(mfrow=c(1,1))
boxplot(cdc$weight)

boxplot(cdc$weight, horizontal=TRUE)

boxplot(cdc$weight ~ cdc$gender)

boxplot(cdc$height ~ cdc$gender)

bmi = (cdc$weight/cdc$height^2) * 703
boxplot(bmi ~ cdc$genhlth)

#觀察兩組資料間關係:點散布圖
plot(cdc$weight, cdc$height)

plot(cdc$weight, cdc$wtdesire)

png(filename='test123.png')
plot(cdc$weight, cdc$height)
dev.off()

## quartz_off_screen 
##                 2

data explorer

# install.packages("DataExplorer")
# Attach DataExplorer and open its package-level help index.
library(DataExplorer)
help(package = "DataExplorer")

# One-row overview of iris (row/column counts, missing values, memory usage).
introduce(data = iris)

##   rows columns discrete_columns continuous_columns all_missing_columns
## 1  150       5                1                  4                   0
##   total_missing_values total_observations memory_usage
## 1                    0                750         7256

# Expand the discrete Species column into 0/1 indicator columns and print the
# resulting data frame.
dummify(data = iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
## 1            5.1         3.5          1.4         0.2              1
## 2            4.9         3.0          1.4         0.2              1
## 3            4.7         3.2          1.3         0.2              1
## 4            4.6         3.1          1.5         0.2              1
## 5            5.0         3.6          1.4         0.2              1
## 6            5.4         3.9          1.7         0.4              1
## 7            4.6         3.4          1.4         0.3              1
## 8            5.0         3.4          1.5         0.2              1
## 9            4.4         2.9          1.4         0.2              1
## 10           4.9         3.1          1.5         0.1              1
## 11           5.4         3.7          1.5         0.2              1
## 12           4.8         3.4          1.6         0.2              1
## 13           4.8         3.0          1.4         0.1              1
## 14           4.3         3.0          1.1         0.1              1
## 15           5.8         4.0          1.2         0.2              1
## 16           5.7         4.4          1.5         0.4              1
## 17           5.4         3.9          1.3         0.4              1
## 18           5.1         3.5          1.4         0.3              1
## 19           5.7         3.8          1.7         0.3              1
## 20           5.1         3.8          1.5         0.3              1
## 21           5.4         3.4          1.7         0.2              1
## 22           5.1         3.7          1.5         0.4              1
## 23           4.6         3.6          1.0         0.2              1
## 24           5.1         3.3          1.7         0.5              1
## 25           4.8         3.4          1.9         0.2              1
## 26           5.0         3.0          1.6         0.2              1
## 27           5.0         3.4          1.6         0.4              1
## 28           5.2         3.5          1.5         0.2              1
## 29           5.2         3.4          1.4         0.2              1
## 30           4.7         3.2          1.6         0.2              1
## 31           4.8         3.1          1.6         0.2              1
## 32           5.4         3.4          1.5         0.4              1
## 33           5.2         4.1          1.5         0.1              1
## 34           5.5         4.2          1.4         0.2              1
## 35           4.9         3.1          1.5         0.2              1
## 36           5.0         3.2          1.2         0.2              1
## 37           5.5         3.5          1.3         0.2              1
## 38           4.9         3.6          1.4         0.1              1
## 39           4.4         3.0          1.3         0.2              1
## 40           5.1         3.4          1.5         0.2              1
## 41           5.0         3.5          1.3         0.3              1
## 42           4.5         2.3          1.3         0.3              1
## 43           4.4         3.2          1.3         0.2              1
## 44           5.0         3.5          1.6         0.6              1
## 45           5.1         3.8          1.9         0.4              1
## 46           4.8         3.0          1.4         0.3              1
## 47           5.1         3.8          1.6         0.2              1
## 48           4.6         3.2          1.4         0.2              1
## 49           5.3         3.7          1.5         0.2              1
## 50           5.0         3.3          1.4         0.2              1
## 51           7.0         3.2          4.7         1.4              0
## 52           6.4         3.2          4.5         1.5              0
## 53           6.9         3.1          4.9         1.5              0
## 54           5.5         2.3          4.0         1.3              0
## 55           6.5         2.8          4.6         1.5              0
## 56           5.7         2.8          4.5         1.3              0
## 57           6.3         3.3          4.7         1.6              0
## 58           4.9         2.4          3.3         1.0              0
## 59           6.6         2.9          4.6         1.3              0
## 60           5.2         2.7          3.9         1.4              0
## 61           5.0         2.0          3.5         1.0              0
## 62           5.9         3.0          4.2         1.5              0
## 63           6.0         2.2          4.0         1.0              0
## 64           6.1         2.9          4.7         1.4              0
## 65           5.6         2.9          3.6         1.3              0
## 66           6.7         3.1          4.4         1.4              0
## 67           5.6         3.0          4.5         1.5              0
## 68           5.8         2.7          4.1         1.0              0
## 69           6.2         2.2          4.5         1.5              0
## 70           5.6         2.5          3.9         1.1              0
## 71           5.9         3.2          4.8         1.8              0
## 72           6.1         2.8          4.0         1.3              0
## 73           6.3         2.5          4.9         1.5              0
## 74           6.1         2.8          4.7         1.2              0
## 75           6.4         2.9          4.3         1.3              0
## 76           6.6         3.0          4.4         1.4              0
## 77           6.8         2.8          4.8         1.4              0
## 78           6.7         3.0          5.0         1.7              0
## 79           6.0         2.9          4.5         1.5              0
## 80           5.7         2.6          3.5         1.0              0
## 81           5.5         2.4          3.8         1.1              0
## 82           5.5         2.4          3.7         1.0              0
## 83           5.8         2.7          3.9         1.2              0
## 84           6.0         2.7          5.1         1.6              0
## 85           5.4         3.0          4.5         1.5              0
## 86           6.0         3.4          4.5         1.6              0
## 87           6.7         3.1          4.7         1.5              0
## 88           6.3         2.3          4.4         1.3              0
## 89           5.6         3.0          4.1         1.3              0
## 90           5.5         2.5          4.0         1.3              0
## 91           5.5         2.6          4.4         1.2              0
## 92           6.1         3.0          4.6         1.4              0
## 93           5.8         2.6          4.0         1.2              0
## 94           5.0         2.3          3.3         1.0              0
## 95           5.6         2.7          4.2         1.3              0
## 96           5.7         3.0          4.2         1.2              0
## 97           5.7         2.9          4.2         1.3              0
## 98           6.2         2.9          4.3         1.3              0
## 99           5.1         2.5          3.0         1.1              0
## 100          5.7         2.8          4.1         1.3              0
## 101          6.3         3.3          6.0         2.5              0
## 102          5.8         2.7          5.1         1.9              0
## 103          7.1         3.0          5.9         2.1              0
## 104          6.3         2.9          5.6         1.8              0
## 105          6.5         3.0          5.8         2.2              0
## 106          7.6         3.0          6.6         2.1              0
## 107          4.9         2.5          4.5         1.7              0
## 108          7.3         2.9          6.3         1.8              0
## 109          6.7         2.5          5.8         1.8              0
## 110          7.2         3.6          6.1         2.5              0
## 111          6.5         3.2          5.1         2.0              0
## 112          6.4         2.7          5.3         1.9              0
## 113          6.8         3.0          5.5         2.1              0
## 114          5.7         2.5          5.0         2.0              0
## 115          5.8         2.8          5.1         2.4              0
## 116          6.4         3.2          5.3         2.3              0
## 117          6.5         3.0          5.5         1.8              0
## 118          7.7         3.8          6.7         2.2              0
## 119          7.7         2.6          6.9         2.3              0
## 120          6.0         2.2          5.0         1.5              0
## 121          6.9         3.2          5.7         2.3              0
## 122          5.6         2.8          4.9         2.0              0
## 123          7.7         2.8          6.7         2.0              0
## 124          6.3         2.7          4.9         1.8              0
## 125          6.7         3.3          5.7         2.1              0
## 126          7.2         3.2          6.0         1.8              0
## 127          6.2         2.8          4.8         1.8              0
## 128          6.1         3.0          4.9         1.8              0
## 129          6.4         2.8          5.6         2.1              0
## 130          7.2         3.0          5.8         1.6              0
## 131          7.4         2.8          6.1         1.9              0
## 132          7.9         3.8          6.4         2.0              0
## 133          6.4         2.8          5.6         2.2              0
## 134          6.3         2.8          5.1         1.5              0
## 135          6.1         2.6          5.6         1.4              0
## 136          7.7         3.0          6.1         2.3              0
## 137          6.3         3.4          5.6         2.4              0
## 138          6.4         3.1          5.5         1.8              0
## 139          6.0         3.0          4.8         1.8              0
## 140          6.9         3.1          5.4         2.1              0
## 141          6.7         3.1          5.6         2.4              0
## 142          6.9         3.1          5.1         2.3              0
## 143          5.8         2.7          5.1         1.9              0
## 144          6.8         3.2          5.9         2.3              0
## 145          6.7         3.3          5.7         2.5              0
## 146          6.7         3.0          5.2         2.3              0
## 147          6.3         2.5          5.0         1.9              0
## 148          6.5         3.0          5.2         2.0              0
## 149          6.2         3.4          5.4         2.3              0
## 150          5.9         3.0          5.1         1.8              0
##     Species_versicolor Species_virginica
## 1                    0                 0
## 2                    0                 0
## 3                    0                 0
## 4                    0                 0
## 5                    0                 0
## 6                    0                 0
## 7                    0                 0
## 8                    0                 0
## 9                    0                 0
## 10                   0                 0
## 11                   0                 0
## 12                   0                 0
## 13                   0                 0
## 14                   0                 0
## 15                   0                 0
## 16                   0                 0
## 17                   0                 0
## 18                   0                 0
## 19                   0                 0
## 20                   0                 0
## 21                   0                 0
## 22                   0                 0
## 23                   0                 0
## 24                   0                 0
## 25                   0                 0
## 26                   0                 0
## 27                   0                 0
## 28                   0                 0
## 29                   0                 0
## 30                   0                 0
## 31                   0                 0
## 32                   0                 0
## 33                   0                 0
## 34                   0                 0
## 35                   0                 0
## 36                   0                 0
## 37                   0                 0
## 38                   0                 0
## 39                   0                 0
## 40                   0                 0
## 41                   0                 0
## 42                   0                 0
## 43                   0                 0
## 44                   0                 0
## 45                   0                 0
## 46                   0                 0
## 47                   0                 0
## 48                   0                 0
## 49                   0                 0
## 50                   0                 0
## 51                   1                 0
## 52                   1                 0
## 53                   1                 0
## 54                   1                 0
## 55                   1                 0
## 56                   1                 0
## 57                   1                 0
## 58                   1                 0
## 59                   1                 0
## 60                   1                 0
## 61                   1                 0
## 62                   1                 0
## 63                   1                 0
## 64                   1                 0
## 65                   1                 0
## 66                   1                 0
## 67                   1                 0
## 68                   1                 0
## 69                   1                 0
## 70                   1                 0
## 71                   1                 0
## 72                   1                 0
## 73                   1                 0
## 74                   1                 0
## 75                   1                 0
## 76                   1                 0
## 77                   1                 0
## 78                   1                 0
## 79                   1                 0
## 80                   1                 0
## 81                   1                 0
## 82                   1                 0
## 83                   1                 0
## 84                   1                 0
## 85                   1                 0
## 86                   1                 0
## 87                   1                 0
## 88                   1                 0
## 89                   1                 0
## 90                   1                 0
## 91                   1                 0
## 92                   1                 0
## 93                   1                 0
## 94                   1                 0
## 95                   1                 0
## 96                   1                 0
## 97                   1                 0
## 98                   1                 0
## 99                   1                 0
## 100                  1                 0
## 101                  0                 1
## 102                  0                 1
## 103                  0                 1
## 104                  0                 1
## 105                  0                 1
## 106                  0                 1
## 107                  0                 1
## 108                  0                 1
## 109                  0                 1
## 110                  0                 1
## 111                  0                 1
## 112                  0                 1
## 113                  0                 1
## 114                  0                 1
## 115                  0                 1
## 116                  0                 1
## 117                  0                 1
## 118                  0                 1
## 119                  0                 1
## 120                  0                 1
## 121                  0                 1
## 122                  0                 1
## 123                  0                 1
## 124                  0                 1
## 125                  0                 1
## 126                  0                 1
## 127                  0                 1
## 128                  0                 1
## 129                  0                 1
## 130                  0                 1
## 131                  0                 1
## 132                  0                 1
## 133                  0                 1
## 134                  0                 1
## 135                  0                 1
## 136                  0                 1
## 137                  0                 1
## 138                  0                 1
## 139                  0                 1
## 140                  0                 1
## 141                  0                 1
## 142                  0                 1
## 143                  0                 1
## 144                  0                 1
## 145                  0                 1
## 146                  0                 1
## 147                  0                 1
## 148                  0                 1
## 149                  0                 1
## 150                  0                 1

# Missing-value profile for each column.
plot_missing(data = iris)

# Histograms of the continuous columns.
plot_histogram(data = iris)

# Box plots of the continuous columns, grouped by Species.
plot_boxplot(data = iris, by = "Species")

# Correlation heatmap with the fifth column (Species) dropped.
plot_correlation(data = iris[-5])

# Principal component analysis plots.
plot_prcomp(data = iris)

# create_report(iris)

R_basic2

York Lin

2018年10月30日

readr package

read excel file

read json

read xml

Flow Control

Review seq()

break / next

p86 example

Function

p90 example

lapply sapply apply tapply

探索性資料分析

表格

apply example

統計量

Covariance & Correlation

統計圖

data explorer