Demo20180208

信用風險評估

trainset <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/Training50.csv')

head(trainset)

##     X Creditability Account.Balance Duration.of.Credit..month.
## 1 497             1               3                          6
## 2 756             0               1                         15
## 3 580             0               1                         42
## 4 833             0               3                         36
## 5 602             1               3                         24
## 6 734             1               1                         15
##   Payment.Status.of.Previous.Credit Purpose Credit.Amount
## 1                                 2       3          2108
## 2                                 1       4           950
## 3                                 2       3          7174
## 4                                 3       4          7980
## 5                                 3       2          2028
## 6                                 2       4          2511
##   Value.Savings.Stocks Length.of.current.employment Instalment.per.cent
## 1                    1                            3                   2
## 2                    1                            4                   4
## 3                    4                            3                   4
## 4                    4                            1                   4
## 5                    1                            3                   2
## 6                    1                            1                   1
##   Sex...Marital.Status Guarantors Duration.in.Current.address
## 1                    3          1                           2
## 2                    2          1                           3
## 3                    1          1                           3
## 4                    2          1                           4
## 5                    2          1                           2
## 6                    1          1                           4
##   Most.valuable.available.asset Age..years. Concurrent.Credits
## 1                             1          29                  2
## 2                             3          33                  2
## 3                             3          30                  2
## 4                             3          27                  2
## 5                             2          30                  2
## 6                             3          23                  2
##   Type.of.apartment No.of.Credits.at.this.Bank Occupation No.of.dependents
## 1                 1                          1          1                1
## 2                 1                          2          1                2
## 3                 2                          1          1                1
## 4                 1                          2          1                1
## 5                 2                          2          1                1
## 6                 1                          1          1                1
##   Telephone Foreign.Worker
## 1         1              1
## 2         1              1
## 3         2              1
## 4         2              1
## 5         1              1
## 6         1              1

library(rpart)

## Warning: package 'rpart' was built under R version 3.4.2

trainset$Creditability <- as.factor(trainset$Creditability)
paste(names(trainset), '', collapse = '+')

## [1] "X +Creditability +Account.Balance +Duration.of.Credit..month. +Payment.Status.of.Previous.Credit +Purpose +Credit.Amount +Value.Savings.Stocks +Length.of.current.employment +Instalment.per.cent +Sex...Marital.Status +Guarantors +Duration.in.Current.address +Most.valuable.available.asset +Age..years. +Concurrent.Credits +Type.of.apartment +No.of.Credits.at.this.Bank +Occupation +No.of.dependents +Telephone +Foreign.Worker "

model <- rpart( Creditability   ~  Account.Balance +Duration.of.Credit..month. +Payment.Status.of.Previous.Credit +Purpose +Credit.Amount +Value.Savings.Stocks +Length.of.current.employment +Instalment.per.cent +Sex...Marital.Status +Guarantors +Duration.in.Current.address +Most.valuable.available.asset +Age..years. +Concurrent.Credits +Type.of.apartment +No.of.Credits.at.this.Bank +Occupation +No.of.dependents +Telephone +Foreign.Worker , data = trainset , method = 'class')

#summary(model)

plot(model, margin = 0.1)
text(model)

testset <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/Test50.csv')
testset$Creditability <- as.factor(testset$Creditability)
predicted <- predict(model, testset, type = 'class')

sum(testset$Creditability == predicted) / length(predicted)

## [1] 0.71

table(testset$Creditability, predicted)

##    predicted
##       0   1
##   0  64  93
##   1  52 291

prediction <- predict(model, testset, type = 'prob')

roc_x <- c(0)
roc_y <- c(0)

for(i in seq(0,1,0.1)){
  #print(i)
  res <- as.factor(ifelse(prediction[,1] >= i, 0, 1))
  tb  <- table(testset$Creditability, res)
  if (ncol(tb) == 2){
    tp <- tb[1,1]
    fn <- tb[2,1]
    fp <- tb[1,2]
    tn <- tb[2,2]
    tpr <- tp / (tp + fn)
    fpr <- fp / (fp + tn)
    roc_y <- c(roc_y, tpr)
    roc_x <- c(roc_x, fpr)
  }
}

roc_y <- c(roc_y, 1)
roc_x <- c(roc_x, 1)

plot(roc_x, roc_y, type = 'b')

library(ROCR)

## Warning: package 'ROCR' was built under R version 3.4.2

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

pred.to.roc <- prediction[, 2] 
pred.rocr <- prediction(pred.to.roc, as.factor(testset$Creditability)) 
perf.rocr <- performance(pred.rocr, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr <- performance(pred.rocr, "tpr","fpr") 
plot(perf.tpr.rocr, colorize=T,main=paste("AUC:",(perf.rocr@y.values)))

隨機森林

library(randomForest)

## randomForest 4.6-12

## Type rfNews() to see new features/changes/bug fixes.

forest <- randomForest( Creditability   ~  Account.Balance +Duration.of.Credit..month. +Payment.Status.of.Previous.Credit +Purpose +Credit.Amount +Value.Savings.Stocks +Length.of.current.employment +Instalment.per.cent +Sex...Marital.Status +Guarantors +Duration.in.Current.address +Most.valuable.available.asset +Age..years. +Concurrent.Credits +Type.of.apartment +No.of.Credits.at.this.Bank +Occupation +No.of.dependents +Telephone +Foreign.Worker , data = trainset ,ntree = 200, importance=T, proximity = T)

predicted <- predict(forest, testset, type = 'class')

sum(predicted  == testset$Creditability) / length(predicted)

## [1] 0.726

table(testset$Creditability, predicted)

##    predicted
##       0   1
##   0  49 108
##   1  29 314

# Decision Tree
predictions1 <- predict(model, testset, type="prob")
pred.to.roc1 <- predictions1[, 2] 
pred.rocr1 <- prediction(pred.to.roc1, as.factor(testset$Creditability)) 
perf.rocr1 <- performance(pred.rocr1, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr1 <- performance(pred.rocr1, "tpr","fpr")

# Random Forest
predictions2 <- predict(forest, testset, type="prob")
pred.to.roc2 <- predictions2[, 2] 
pred.rocr2 <- prediction(pred.to.roc2, as.factor(testset$Creditability)) 
perf.rocr2 <- performance(pred.rocr2, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr2 <- performance(pred.rocr2, "tpr","fpr")

## SVM
library(e1071)

## Warning: package 'e1071' was built under R version 3.4.3

svmfit <- svm( Creditability   ~  Account.Balance +Duration.of.Credit..month. +Payment.Status.of.Previous.Credit +Purpose +Credit.Amount +Value.Savings.Stocks +Length.of.current.employment +Instalment.per.cent +Sex...Marital.Status +Guarantors +Duration.in.Current.address +Most.valuable.available.asset +Age..years. +Concurrent.Credits +Type.of.apartment +No.of.Credits.at.this.Bank +Occupation +No.of.dependents +Telephone +Foreign.Worker , data = trainset, probability=TRUE )

## Warning in svm.default(x, y, scale = scale, ..., na.action = na.action):
## Variable(s) 'Occupation' constant. Cannot scale data.

predictions3 <- predict(svmfit, testset, probability = TRUE, kernel='linear')


pred.to.roc3 <- attr(predictions3,"probabilities")[, 1] 
pred.rocr3 <- prediction(pred.to.roc3, as.factor(testset$Creditability)) 
perf.rocr3 <- performance(pred.rocr3, measure = "auc", x.measure = "cutoff") 
perf.tpr.rocr3 <- performance(pred.rocr3, "tpr","fpr")


plot(perf.tpr.rocr1,main='ROC Curve', col=1)
legend(0.7, 0.2, c('rpart', 'randomforest', 'svm'), 1:3)
plot(perf.tpr.rocr2, col=2, add=TRUE)
plot(perf.tpr.rocr3, col=3, add=TRUE)

#save(x = forest, file = 'forest.RData')

Using Caret

library(caret)

## Warning: package 'caret' was built under R version 3.4.2

## Loading required package: lattice

## Loading required package: ggplot2

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

data(iris)
TrainData <- iris[,1:4]
TrainClasses <- iris[,5]

fit1 <- train(TrainData, TrainClasses,
            method = "knn",
            preProcess = c("center", "scale"),
            tuneLength = 10,
            trControl = trainControl(method = "cv"))

Clustering

data(iris)
hc <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'ward.D2')

plot(hc, hang=-0.1,cex = 0.7)

hc2 <- hclust(dist(iris[,-5], method = 'euclidean'), method = 'single')

plot(hc2, hang=-0.1,cex = 0.7)

fit <- cutree(hc, k = 3)
fit

##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
##  [71] 2 2 2 2 2 2 2 3 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 3 2 3 3 3
## [106] 3 2 3 3 3 3 3 3 2 2 3 3 3 3 2 3 2 3 2 3 3 2 2 3 3 3 3 3 2 2 3 3 3 2 3
## [141] 3 3 2 3 3 3 2 3 3 2

plot(hc)
rect.hclust(hc, k = 3, border = 'red')

par(mfrow = c(1,2))
plot(iris$Petal.Length, iris$Petal.Width , col = fit)
plot(iris$Petal.Length, iris$Petal.Width , col = iris$Species)

KMeans

set.seed(42)
fit <- kmeans(iris[,-5], 3 )
barplot(t(fit$centers), beside = TRUE, xlab = 'cluster', ylab = 'value')

plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster)

par(mfrow = c(1,2))
plot(Petal.Width ~ Petal.Length, data = iris, col = iris$Species)
plot(Petal.Width ~ Petal.Length, data = iris, col = fit$cluster)

Silhouette

set.seed(22)
km <- kmeans(iris[,-5], 3)

library(cluster)
kms <- silhouette(km$cluster, dist= dist(iris[,-5]))
summary(kms)

## Silhouette of 150 units in 3 clusters from silhouette.default(x = km$cluster, dist = dist(iris[, -5])) :
##  Cluster sizes and average silhouette widths:
##        50        62        38 
## 0.7981405 0.4173199 0.4511051 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.02636 0.39115 0.56231 0.55282 0.77552 0.85391

plot(kms)

nk <- 2:10
set.seed(22)
WSS <- sapply(nk, function(k) {
  kmeans(iris[,-5], centers=k)$tot.withinss
})
WSS

## [1] 152.34795  78.85144  57.26562  46.44618  42.42965  37.39514  34.75335
## [8]  41.96594  33.71865

plot(nk, WSS, type="l", xlab= "number of k", ylab="within sum of squares")

library(fpc)

## Warning: package 'fpc' was built under R version 3.4.2

nk <- 2:10
set.seed(256)
SW <-  sapply(nk, function(k) {
  cluster.stats(dist(iris[,-5]), kmeans(iris[,-5], centers=k)$cluster)$avg.silwidth
})
plot(nk, SW, type="l", xlab="number of clusers", ylab="average silhouette width")

single_c   <-  hclust(dist(iris[,-5]), method="single")
hc_single  <-  cutree(single_c, k = 3)

complete_c <-  hclust(dist(iris[,-5]), method="complete")
hc_complete <-  cutree(complete_c, k = 3)

set.seed(42)
km <- kmeans(iris[,-5], 3)

cs <- cluster.stats(dist(iris[,-5]), km$cluster)
cs[c("within.cluster.ss","avg.silwidth")]

## $within.cluster.ss
## [1] 78.85144
## 
## $avg.silwidth
## [1] 0.552819

sapply(list(kmeans = km$cluster, hc_single = hc_single, hc_complete = hc_complete), function(c) cluster.stats(dist(iris[,-5]), c)[c("within.cluster.ss","avg.silwidth")])

##                   kmeans   hc_single hc_complete
## within.cluster.ss 78.85144 142.4794  89.52501   
## avg.silwidth      0.552819 0.5121108 0.5135953

Customer Segmentation

customers <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/customers.csv')

head(customers)

##   CustomerID  Genre Age Annual.Income..k.. Spending.Score..1.100.
## 1          1   Male  19                 15                     39
## 2          2   Male  21                 15                     81
## 3          3 Female  20                 16                      6
## 4          4 Female  23                 16                     77
## 5          5 Female  31                 17                     40
## 6          6 Female  22                 17                     76

names(customers)

## [1] "CustomerID"             "Genre"                 
## [3] "Age"                    "Annual.Income..k.."    
## [5] "Spending.Score..1.100."

customers <- customers[, 4:5]

set.seed(123)
kc <- kmeans(customers, 5)
kc

## K-means clustering with 5 clusters of sizes 50, 27, 74, 39, 10
## 
## Cluster means:
##   Annual.Income..k.. Spending.Score..1.100.
## 1           27.40000               49.48000
## 2           79.00000               16.59259
## 3           55.90541               49.93243
## 4           86.53846               82.12821
## 5          109.70000               22.00000
## 
## Clustering vector:
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [71] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [106] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4
## [141] 2 4 3 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2 4 2
## [176] 4 2 4 2 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4 5 4
## 
## Within cluster sum of squares by cluster:
## [1] 48174.480  4062.519  7375.000 13444.051  2458.100
##  (between_SS / total_SS =  72.0 %)
## 
## Available components:
## 
## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

plot(customers$Annual.Income..k.., customers$Spending.Score..1.100., col = kc$cluster)
points(kc$centers[,1], kc$centers[,2], col="orange", cex = 3, pch=19)

DBSCAN

#install.packages('png')
library(png)
getwd()

## [1] "D:/OS DATA/Documents"

img1 <- readPNG("17.png", TRUE)
#img1
img3 <- img1[,nrow(img1):1]
b <- cbind(as.integer(which(img3 < -1) %% 28), which(img3 < -1) / 28)
plot(b, xlim=c(1,28), ylim=c(1,28))

set.seed(123)
fit <- kmeans(b, 2)
plot(b, col=fit$cluster, xlim=c(1,28), ylim=c(1,28))
points(fit$centers, col = 'orange', pch = 19)

library(fpc)
ds <- dbscan(b, 2)
plot(ds, b)

Feature Selection

#install.packages('FSelector')
library(FSelector)

## Warning: package 'FSelector' was built under R version 3.4.3

trainset$X <- NULL

weights <- random.forest.importance(Creditability ~ ., trainset, importance.type = 1)

print(weights)

##                                   attr_importance
## Account.Balance                        29.0169088
## Duration.of.Credit..month.             18.0253798
## Payment.Status.of.Previous.Credit      21.8627585
## Purpose                                 8.9936551
## Credit.Amount                          14.6702010
## Value.Savings.Stocks                   11.2432773
## Length.of.current.employment            3.0559581
## Instalment.per.cent                     5.1239967
## Sex...Marital.Status                    1.8639550
## Guarantors                             10.6877008
## Duration.in.Current.address             4.0700996
## Most.valuable.available.asset           6.2778230
## Age..years.                             1.7322556
## Concurrent.Credits                      5.6393773
## Type.of.apartment                       3.6570027
## No.of.Credits.at.this.Bank              2.5785666
## Occupation                              0.0000000
## No.of.dependents                        6.0041998
## Telephone                              -0.5021911
## Foreign.Worker                         -1.2592128

subset <- cutoff.k(weights, 5)
f <- as.simple.formula(subset, "Creditability")
f

## Creditability ~ Account.Balance + Payment.Status.of.Previous.Credit + 
##     Duration.of.Credit..month. + Credit.Amount + Value.Savings.Stocks
## <environment: 0x00000000341ce378>

library(caret)
control <- trainControl(method="repeatedcv", number=10, repeats=3)

model <-  train(Creditability~., data=trainset, method="rpart", preProcess="scale", trControl=control)

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

## Warning in preProcess.default(thresh = 0.95, k = 5, freqCut = 19, uniqueCut
## = 10, : These variables have zero variances: Occupation

importance <-  varImp(model, scale=FALSE)
plot(importance)

#install.packages("rminer")
library(rminer)

## Warning: package 'rminer' was built under R version 3.4.2

model <- fit(Creditability~.,trainset,model="rpart")

VariableImportance <- Importance(model,trainset,method="sensv")

L=list(runs=1,sen=t(VariableImportance$imp),sresponses=VariableImportance$sresponses)

mgraph(L,graph="IMP",leg=names(trainset),col="gray",Grid=10)

Evluator

 evaluator = function(subset) {
   k = 5  
   set.seed(2)
   ind = sample(5, nrow(trainset), replace = TRUE)
   results = sapply(1:k, function(i) {
     train = trainset[ind ==i,]
     test  = trainset[ind !=i,]
     tree  = rpart(as.simple.formula(subset, "Creditability"), trainset)
     error.rate = sum(test$Creditability != predict(tree, test, type="class")) / nrow(test)
     return(1 - error.rate)
   })
   return(mean(results))
 }

attr.subset = hill.climbing.search(names(trainset)[!names(trainset) %in% "Creditability"], evaluator)
f <-  as.simple.formula(attr.subset, "Creditability")
print(f)

## Creditability ~ Account.Balance + Duration.of.Credit..month. + 
##     Payment.Status.of.Previous.Credit + Purpose + Value.Savings.Stocks + 
##     Length.of.current.employment + Instalment.per.cent + Guarantors + 
##     Duration.in.Current.address + Age..years. + Concurrent.Credits + 
##     Telephone
## <environment: 0x0000000034847450>

Feature Extraction

data(swiss)

head(swiss)

##              Fertility Agriculture Examination Education Catholic
## Courtelary        80.2        17.0          15        12     9.96
## Delemont          83.1        45.1           6         9    84.84
## Franches-Mnt      92.5        39.7           5         5    93.40
## Moutier           85.8        36.5          12         7    33.77
## Neuveville        76.9        43.5          17        15     5.16
## Porrentruy        76.1        35.3           9         7    90.57
##              Infant.Mortality
## Courtelary               22.2
## Delemont                 22.2
## Franches-Mnt             20.2
## Moutier                  20.3
## Neuveville               20.6
## Porrentruy               26.6

swiss <-  swiss[,-1]
swiss.pca = prcomp(swiss,
   center = TRUE,
   scale  = TRUE)
swiss.pca

## Standard deviations (1, .., p=5):
## [1] 1.6228065 1.0354873 0.9033447 0.5592765 0.4067472
## 
## Rotation (n x k) = (5 x 5):
##                          PC1         PC2          PC3        PC4
## Agriculture       0.52396452 -0.25834215  0.003003672 -0.8090741
## Examination      -0.57185792 -0.01145981 -0.039840522 -0.4224580
## Education        -0.49150243  0.19028476  0.539337412 -0.3321615
## Catholic          0.38530580  0.36956307  0.725888143  0.1007965
## Infant.Mortality  0.09167606  0.87197641 -0.424976789 -0.2154928
##                          PC5
## Agriculture       0.06411415
## Examination      -0.70198942
## Education         0.56656945
## Catholic         -0.42176895
## Infant.Mortality  0.06488642

summary(swiss.pca)

## Importance of components%s:
##                           PC1    PC2    PC3     PC4     PC5
## Standard deviation     1.6228 1.0355 0.9033 0.55928 0.40675
## Proportion of Variance 0.5267 0.2145 0.1632 0.06256 0.03309
## Cumulative Proportion  0.5267 0.7411 0.9043 0.96691 1.00000

predict(swiss.pca, newdata=head(swiss, 1))

##                   PC1       PC2        PC3      PC4       PC5
## Courtelary -0.9390479 0.8047122 -0.8118681 1.000307 0.4618643

screeplot(swiss.pca, type="barplot")

screeplot(swiss.pca, type="line")

swiss.pca$sdev ^ 2

## [1] 2.6335008 1.0722340 0.8160316 0.3127902 0.1654433

which(swiss.pca$sdev ^ 2> 1)

## [1] 1 2

screeplot(swiss.pca, type="line")
abline(h=1, col="red", lty= 3)

head(swiss)

##              Agriculture Examination Education Catholic Infant.Mortality
## Courtelary          17.0          15        12     9.96             22.2
## Delemont            45.1           6         9    84.84             22.2
## Franches-Mnt        39.7           5         5    93.40             20.2
## Moutier             36.5          12         7    33.77             20.3
## Neuveville          43.5          17        15     5.16             20.6
## Porrentruy          35.3           9         7    90.57             26.6

plot(swiss.pca$x[,1], swiss.pca$x[,2], xlim=c(-4,4))
text(swiss.pca$x[,1], swiss.pca$x[,2], rownames(swiss.pca$x), cex=0.7, pos=4, col="red")

biplot(swiss.pca)

幸福指標

dataset <- read.csv('https://raw.githubusercontent.com/ywchiu/fuboni/master/data/eco_index.csv', row.names = 1)

dataset

##                  sales expense income.average
## Taipei          122838   14.83         704024
## Tainan           24689   14.56         430680
## New Taipei City  41719   13.10         506143
## Kaohsiung        41982   11.56         487309
## Taichung         35856   10.56         511603
## Hsinchu County    9665   22.57         569021
## Hsinchu          16017   12.27         604880
## Taoyuan County   35998   11.67         509530
## Pingtung County   3520   23.74         429733
## Hualien County    1614   19.54         475333
## Miaoli County     6231   17.11         432448
## Chiayi City       2131   19.58         474522
## Yilan County      2545   20.81         406145
## Changhua County  12930   12.30         417594
## Penghu County      249   28.51         412705
## Yunlin County    14071   16.57         412278
## Chiayi County     2658   24.22         392155
## Taitung            670   20.95         411274
## Keelung           1674   22.07         467203
## Nantou County     2510   14.87         405789

pc.cr <- princomp(dataset, cor = TRUE) 
plot(pc.cr)

screeplot(pc.cr, type = 'lines')

biplot(pc.cr)

barplot(-sort(pc.cr$scores[,1]), cex.axis = 0.5, cex.names = 0.5)

-sort(pc.cr$scores[,1])

##          Taipei         Hsinchu        Taichung  Taoyuan County 
##      4.44166677      1.47378234      1.36963836      1.24859975 
##       Kaohsiung New Taipei City          Tainan  Hsinchu County 
##      1.22361917      1.21393904      0.09492659      0.04573421 
## Changhua County   Yunlin County   Miaoli County     Chiayi City 
##     -0.05588516     -0.48721191     -0.56259111     -0.57010732 
##  Hualien County   Nantou County         Keelung    Yilan County 
##     -0.57174521     -0.63677543     -0.88013703     -1.21208605 
##         Taitung Pingtung County   Chiayi County   Penghu County 
##     -1.22879883     -1.29191378     -1.65058209     -1.96407231

Demo20180208

David Chiu

2018年2月8日

信用風險評估

隨機森林

Using Caret

Clustering

KMeans

Silhouette

Customer Segmentation

DBSCAN

Feature Selection

Evluator

Feature Extraction

幸福指標