國泰大數據競賽R語言教育訓練

R 基礎指令

data(iris)
class(iris)

## [1] "data.frame"

class(iris)

## [1] "data.frame"

str(iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

summary(iris)

##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
##

head(iris)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

head(iris, 10)

##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1           5.1         3.5          1.4         0.2  setosa
## 2           4.9         3.0          1.4         0.2  setosa
## 3           4.7         3.2          1.3         0.2  setosa
## 4           4.6         3.1          1.5         0.2  setosa
## 5           5.0         3.6          1.4         0.2  setosa
## 6           5.4         3.9          1.7         0.4  setosa
## 7           4.6         3.4          1.4         0.3  setosa
## 8           5.0         3.4          1.5         0.2  setosa
## 9           4.4         2.9          1.4         0.2  setosa
## 10          4.9         3.1          1.5         0.1  setosa

tail(iris)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica

tail(iris, 10)

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 141          6.7         3.1          5.6         2.4 virginica
## 142          6.9         3.1          5.1         2.3 virginica
## 143          5.8         2.7          5.1         1.9 virginica
## 144          6.8         3.2          5.9         2.3 virginica
## 145          6.7         3.3          5.7         2.5 virginica
## 146          6.7         3.0          5.2         2.3 virginica
## 147          6.3         2.5          5.0         1.9 virginica
## 148          6.5         3.0          5.2         2.0 virginica
## 149          6.2         3.4          5.4         2.3 virginica
## 150          5.9         3.0          5.1         1.8 virginica

iris[1:3,]

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa

iris[1:3,1]

## [1] 5.1 4.9 4.7

iris[1:3,"Sepal.Length"]

## [1] 5.1 4.9 4.7

head(iris[,1:2])

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6
## 6          5.4         3.9

head(iris$"Sepal.Length")

## [1] 5.1 4.9 4.7 4.6 5.0 5.4

five.Sepal.iris <- iris[1:5, c("Sepal.Length", "Sepal.Width")]
head(five.Sepal.iris)

##   Sepal.Length Sepal.Width
## 1          5.1         3.5
## 2          4.9         3.0
## 3          4.7         3.2
## 4          4.6         3.1
## 5          5.0         3.6

setosa.data <- iris[iris$Species=="setosa",1:5]
head(setosa.data)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

head(sort(iris$Sepal.Length, decreasing = TRUE))

## [1] 7.9 7.7 7.7 7.7 7.7 7.6

head(iris[order(iris$Sepal.Length, decreasing = TRUE),])

##     Sepal.Length Sepal.Width Petal.Length Petal.Width   Species
## 132          7.9         3.8          6.4         2.0 virginica
## 118          7.7         3.8          6.7         2.2 virginica
## 119          7.7         2.6          6.9         2.3 virginica
## 123          7.7         2.8          6.7         2.0 virginica
## 136          7.7         3.0          6.1         2.3 virginica
## 106          7.6         3.0          6.6         2.1 virginica

R 繪圖功能

#Pie Chart
table.iris = table(iris$Species)
pie(table.iris)

#Histogram
hist(iris$Sepal.Length)

#Box Plot
boxplot(Petal.Width ~ Species, data = iris)

#Scatter Plot
plot(x=iris$Petal.Length, y=iris$Petal.Width, col=iris$Species)

使用R 探索資料

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/lvr_prices.csv', 'lvr_prices.csv')

getwd()

## [1] "D:/OS DATA/Desktop"

library(readr)
lvr_prices <- read_csv("lvr_prices.csv")

## Warning: Missing column names filled in: 'X1' [1]

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   X1 = col_integer(),
##   land_sqmeter = col_double(),
##   trading_ymd = col_date(format = ""),
##   finish_ymd = col_date(format = ""),
##   building_sqmeter = col_double(),
##   room = col_integer(),
##   living_room = col_integer(),
##   bath = col_integer(),
##   total_price = col_integer(),
##   price_per_sqmeter = col_double(),
##   parking_sqmeter = col_double(),
##   parking_price = col_integer()
## )

## See spec(...) for full column specifications.

## Warning in rbind(names(probs), probs_f): number of columns of result is not
## a multiple of vector length (arg 1)

## Warning: 32 parsing failures.
## row # A tibble: 5 x 5 col     row         col   expected     actual             file expected   <int>       <chr>      <chr>      <chr>            <chr> actual 1  1282 total_price an integer 6700000000 'lvr_prices.csv' file 2  2243 total_price an integer 3882685600 'lvr_prices.csv' row 3  2244 total_price an integer 3373314400 'lvr_prices.csv' col 4  4629 total_price an integer 3050000000 'lvr_prices.csv' expected 5  5890 total_price an integer 3133800000 'lvr_prices.csv'
## ... ................. ... .......................................................... ........ .......................................................... ...... .......................................................... .... .......................................................... ... .......................................................... ... .......................................................... ........ ..........................................................
## See problems(...) for more details.

#View(lvr_prices)

資料探索

daan <- lvr_prices[lvr_prices$area == '大安區',]
sum(as.numeric(daan$total_price), na.rm = TRUE)

## [1] 2.79477e+11

mean(as.numeric(daan$total_price), na.rm = TRUE)

## [1] 29798170

zhongshan <- lvr_prices[lvr_prices$area == '中山區',c('address', 'total_price')]
idx <- order(zhongshan$total_price, decreasing = TRUE)
res <- zhongshan[idx,]
res[1:3,]

## # A tibble: 3 x 2
##                               address total_price
##                                 <chr>       <int>
## 1 臺北市中山區建國北路一段138巷1~30號  1850000000
## 2      臺北市中山區南京東路三段1~30號  1400000000
## 3               中山段二小段31~60地號  1084948034

getTopThree <- function(area){
  zhongshan <- lvr_prices[lvr_prices$area == area,]
  idx <- order(zhongshan$total_price, decreasing = TRUE)
  res <- zhongshan[idx,c('area', 'address', 'total_price')]
  return(res[1:3,])
}

getTopThree('大安區')

## # A tibble: 3 x 3
##     area                                address total_price
##    <chr>                                  <chr>       <int>
## 1 大安區 臺北市大安區羅斯福路三段283巷4弄1~30號  1869781219
## 2 大安區      臺北市大安區忠孝東路四段241~270號   971340000
## 3 大安區                  學府段三小段31~60地號   966660000

tapply(lvr_prices$total_price, lvr_prices$area, function(e)mean(e,na.rm=TRUE))

##   士林區   大同區   大安區   中山區   中正區   內湖區   文山區   北投區 
## 24139903 18063872 29798170 26708805 30154011 27905514 16953869 20626410 
##   松山區   信義區   南港區   萬華區 
## 25652125 24725051 25235793 13642289

price_per_sec <- tapply(lvr_prices$total_price, lvr_prices$area, function(e)mean(e,na.rm=TRUE))

barplot(sort(price_per_sec, decreasing = TRUE), main= "各區平均價", xlab = "區域", ylab = "價格", col="blue")

boxplot(log(total_price) ~ area, data = lvr_prices, main= "房價箱型圖", xlab = "區域", ylab = "價格(log)")

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 1 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 2 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 3 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 4 is not drawn

## Warning in bplt(at[i], wid = width[i], stats = z$stats[, i], out = z$out[z
## $group == : Outlier (-Inf) in boxplot 6 is not drawn

## 使用 dplyr 分析資料

# install.packages("dplyr")

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

#help(package='dplyr')


filter(lvr_prices, area == '中山區') %>% head()

## # A tibble: 6 x 29
##      X1   area  trading_target                               address
##   <int>  <chr>           <chr>                                 <chr>
## 1    13 中山區 房地(土地+建物)             臺北市中山區合江街31~60號
## 2    14 中山區 房地(土地+建物)   臺北市中山區中山北路二段183巷1~30號
## 3    16 中山區 房地(土地+建物)           臺北市中山區吉林路361~390號
## 4    17 中山區            土地               長安段三小段271~300地號
## 5    24 中山區 房地(土地+建物)       臺北市中山區林森北路485巷1~30號
## 6    39 中山區 房地(土地+建物) 臺北市中山區建國北路三段93巷5弄1~30號
## # ... with 25 more variables: land_sqmeter <dbl>, city_land_type <chr>,
## #   non_city_land_type <chr>, non_city_code <chr>, trading_ymd <date>,
## #   trading_num <chr>, floor <chr>, total_floor <chr>,
## #   building_type <chr>, main_purpose <chr>, built_with <chr>,
## #   finish_ymd <date>, building_sqmeter <dbl>, room <int>,
## #   living_room <int>, bath <int>, compartment <chr>, management <chr>,
## #   total_price <int>, price_per_sqmeter <dbl>, parking_type <chr>,
## #   parking_sqmeter <dbl>, parking_price <int>, comments <chr>,
## #   numbers <chr>

select(lvr_prices, total_price) %>% head()

## # A tibble: 6 x 1
##   total_price
##         <int>
## 1    18680000
## 2    20300000
## 3      132096
## 4     4200000
## 5    14000000
## 6      255000

lvr_prices %>% 
  select(area, total_price) %>% 
  filter(area == '中山區') %>%
  head()

## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區     5960000
## 2 中山區    20200000
## 3 中山區     4050000
## 4 中山區     1900000
## 5 中山區    14800000
## 6 中山區    10200000

lvr_prices %>% 
  select(area, total_price) %>% 
  filter(area == '中山區') %>%
  arrange(total_price) %>%
  head()

## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區           0
## 2 中山區           0
## 3 中山區       10860
## 4 中山區       16000
## 5 中山區       18060
## 6 中山區       21244

lvr_prices %>% 
  select(area, total_price) %>% 
  filter(area == '中山區') %>%
  arrange(desc(total_price)) %>%
  head()

## # A tibble: 6 x 2
##     area total_price
##    <chr>       <int>
## 1 中山區  1850000000
## 2 中山區  1400000000
## 3 中山區  1084948034
## 4 中山區  1011136500
## 5 中山區   952875000
## 6 中山區   903865500

lvr_prices$trading_ym <- as.Date(format(lvr_prices$trading_ymd, '%Y-%m-01'))

lvr_stat <- lvr_prices %>% 
    select(trading_ym, area, total_price) %>% 
    filter(trading_ym >= '2012-01-01') %>% 
    group_by(trading_ym, area) %>% 
summarise(overall_price = sum(as.numeric(total_price), na.rm=TRUE))

lvr_stat$area <- as.factor(lvr_stat$area)

par(mfrow=c(3,4))
for (a in levels(lvr_stat$area)){
  plot(overall_price ~ trading_ym
       ,lvr_stat[lvr_stat$area == a,]
       , type='l', main = a)
}

library(tidyr)

price_pivot <- spread(lvr_stat, trading_ym, overall_price, fill=0)

View(price_pivot)

write.csv(price_pivot, 'taipei_house_price.csv')

分類問題

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Training50.csv', 'Training50.csv')

library(rpart)
trainset <- read.csv('Training50.csv')
trainset$X <- NULL
trainset$Creditability = as.factor(trainset$Creditability)
head(trainset)

##   Creditability Account.Balance Duration.of.Credit..month.
## 1             1               3                          6
## 2             0               1                         15
## 3             0               1                         42
## 4             0               3                         36
## 5             1               3                         24
## 6             1               1                         15
##   Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1                                 2       3          2108
## 2                                 1       4           950
## 3                                 2       3          7174
## 4                                 3       4          7980
## 5                                 3       2          2028
## 6                                 2       4          2511
##   Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1                    1                            3                   2
## 2                    1                            4                   4
## 3                    4                            3                   4
## 4                    4                            1                   4
## 5                    1                            3                   2
## 6                    1                            1                   1
##   Sex...Marital.Status Guarantors Duration.in.Current.address
## 1                    3          1                           2
## 2                    2          1                           3
## 3                    1          1                           3
## 4                    2          1                           4
## 5                    2          1                           2
## 6                    1          1                           4
##   Most.valuable.available.asset Age..years. Concurrent.Credits
## 1                             1          29                  2
## 2                             3          33                  2
## 3                             3          30                  2
## 4                             3          27                  2
## 5                             2          30                  2
## 6                             3          23                  2
##   Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1                 1                          1          1                1
## 2                 1                          2          1                2
## 3                 2                          1          1                1
## 4                 1                          2          1                1
## 5                 2                          2          1                1
## 6                 1                          1          1                1
##   Telephone Foreign.Worker
## 1         1              1
## 2         1              1
## 3         2              1
## 4         2              1
## 5         1              1
## 6         1              1

library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

model <- rpart(Creditability ~ ., data=trainset, method = 'class') 

#summary(model)

plot(model, margin = 0.1)
text(model, pretty=0,cex=0.6)

download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/Test50.csv', 'Test50.csv')

testset <- read.csv('Test50.csv')
testset$X <- NULL
testset$Creditability = as.factor(testset$Creditability)
predicted <- predict(model, testset, type = "class")
table(predicted, testset$Creditability)

##          
## predicted   0   1
##         0  64  52
##         1  93 291

繪製ROC Curve

#install.packages('caret')
#install.packages('e1071')
library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

prediction <- predict(model, testset, type = "prob")

roc_x <- c(0)
roc_y <- c(0)
for(i in seq(0,1,0.01)){
  res <- as.factor(ifelse(prediction[,1] >= i, 0, 1))

  tb <- table(testset$Creditability, res)
  if (ncol(tb) == 2){
    cm <- confusionMatrix(tb)
    x <- 1 - cm$byClass[2]
    y <- cm$byClass[1]
    roc_x <- c(roc_x, x)
    roc_y <- c(roc_y, y)
  }
}
roc_x <- c(roc_x, 1)
roc_y <- c(roc_y, 1)
plot(roc_x, roc_y, type='b')

#install.packages('ROCR')
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

predictions <- predict(model, testset, type="prob")
pred.to.roc <- predictions[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

## 使用隨機森林 (Random Forest)

library(randomForest)

forest <- randomForest(Creditability ~., data = trainset, ntree=200, importance=T, proximity=T) 

forest.predicted <- predict(forest, testset, type = "class")
table(forest.predicted, testset$Creditability)

##                 
## forest.predicted   0   1
##                0  53  25
##                1 104 318

(316 + 53) / (316+53+27+104)

## [1] 0.738

predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")


plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest'), 1:2)
plot(perf.tpr.rocr2, col=2, add=TRUE)

## 分群問題

#download.file('https://raw.githubusercontent.com/ywchiu/cathayr/master/data/customers.csv', 'customers.csv')
customers <- read.csv('customers.csv')
head(customers)

##   CustomerID  Genre Age Annual_Income Spending_Score
## 1          1   Male  19            15             39
## 2          2   Male  21            15             81
## 3          3 Female  20            16              6
## 4          4 Female  23            16             77
## 5          5 Female  31            17             40
## 6          6 Female  22            17             76

customers <- customers[,c('Annual_Income', 'Spending_Score')]

set.seed(123)
kc <-  kmeans(customers, centers = 5)
kc

## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
## 
## Cluster means:
##   Annual_Income Spending_Score
## 1      27.40000       49.48000
## 2      79.00000       16.59259
## 3      55.90541       49.93243
## 4      86.53846       82.12821
## 5     109.70000       22.00000
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 48174.480  4062.519  7375.000 13444.051  2458.100
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

plot(customers$Annual_Income, customers$Spending_Score, col=kc$cluster)
points(kc$centers[,1], kc$centers[,2], col='yellow')

library(cluster)
kcs = silhouette(kc$cluster,dist(customers))
summary(kcs)

## Silhouette of 200 units in 5 clusters from silhouette.default(x = kc$cluster, dist = dist(customers)) :
##  Cluster sizes and average silhouette widths:
##         50         27         74         39         10 
## 0.04103987 0.47253618 0.63162169 0.48851396 0.35704613 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.4401  0.2089  0.5292  0.4209  0.6421  0.7556

plot(kcs)

library(fpc)
nk <- 2:10
set.seed(123)
SW <- sapply(nk, function(k) {
  cluster.stats(dist(customers), kmeans(customers, centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

國泰大數據競賽R語言教育訓練

David Chiu

2017年9月19日

R 基礎指令

R 繪圖功能

使用R 探索資料

資料探索

分類問題

繪製ROC Curve