library(AppliedPredictiveModeling)
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## 필요한 패키지를 로딩중입니다: ggplot2
## 필요한 패키지를 로딩중입니다: lattice
# Load the two-class example data set; the `predictors` data frame and the
# `classes` vector used below come from it.
data("twoClassData")

# Peek at the first rows of the predictor table.
predictors %>% head()
## PredictorA PredictorB
## 1 0.1582 0.1609
## 2 0.6552 0.4918
## 3 0.7060 0.6333
## 4 0.1992 0.0881
## 5 0.3952 0.4152
## 6 0.4250 0.2988

# Confirm the container type of the predictors.
class(x = predictors)
## [1] "data.frame"

# Compact column-by-column overview (dplyr).
predictors %>% glimpse()
## Rows: 208
## Columns: 2
## $ PredictorA <dbl> 0.1582, 0.6552, 0.7060, 0.1992, 0.3952, 0.4250, 0.0658, 0.3…
## $ PredictorB <dbl> 0.1609, 0.4918, 0.6333, 0.0881, 0.4152, 0.2988, 0.1786, 0.2…

# Number of outcome labels; should match the number of predictor rows.
length(x = classes)
## [1] 208
# Hold-out split: fix the RNG seed, then ask caret for the row indices of a
# training set holding about 80% of the samples. `list = FALSE` returns the
# indices as a one-column matrix instead of a list.
set.seed(seed = 1)
traingRows <- createDataPartition(y = classes, p = 0.8, list = FALSE)
head(x = traingRows)
## Resample1
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 7
## [5,] 8
## [6,] 9

# Rows selected above form the training set ...
trainPredictors <- predictors[traingRows, ]
trainClasses <- classes[traingRows]

# ... and negative indexing keeps everything else for the test set.
testPredictors <- predictors[-traingRows, ]
testClasses <- classes[-traingRows]

# Check the sizes of both subsets.
str(object = trainPredictors)
## 'data.frame': 167 obs. of 2 variables:
## $ PredictorA: num 0.1582 0.6552 0.706 0.0658 0.3086 ...
## $ PredictorB: num 0.161 0.492 0.633 0.179 0.28 ...
str(object = testPredictors)
## 'data.frame': 41 obs. of 2 variables:
## $ PredictorA: num 0.1992 0.3952 0.425 0.0847 0.2909 ...
## $ PredictorB: num 0.0881 0.4152 0.2988 0.0548 0.3021 ...
# Three independent 80% training splits of the training classes; the default
# list output holds one index vector per resample.
repeatedSplit <- createDataPartition(
  y = trainClasses,
  times = 3,
  p = 0.8
)
str(object = repeatedSplit)
## List of 3
## $ Resample1: int [1:135] 1 2 3 4 5 6 8 9 10 11 ...
## $ Resample2: int [1:135] 1 2 3 4 5 6 7 8 9 11 ...
## $ Resample3: int [1:135] 1 2 3 4 5 6 9 10 11 14 ...
# 10-fold cross-validation indices. With `returnTrain = TRUE` each list element
# holds the rows used for fitting in that fold, not the held-out rows.
set.seed(seed = 1)
cvSplit <- createFolds(
  y = trainClasses,
  k = 10,
  returnTrain = TRUE
)
str(object = cvSplit)
## List of 10
## $ Fold01: int [1:150] 1 2 4 5 6 7 8 10 11 13 ...
## $ Fold02: int [1:150] 1 2 3 4 6 7 8 9 10 11 ...
## $ Fold03: int [1:150] 1 3 4 5 6 7 8 9 10 11 ...
## $ Fold04: int [1:150] 1 2 3 4 5 6 7 8 9 10 ...
## $ Fold05: int [1:150] 2 3 4 5 6 7 8 9 10 11 ...
## $ Fold06: int [1:150] 1 2 3 4 5 6 7 8 9 11 ...
## $ Fold07: int [1:150] 1 2 3 4 5 6 7 9 10 12 ...
## $ Fold08: int [1:151] 1 2 3 4 5 6 8 9 10 11 ...
## $ Fold09: int [1:151] 1 2 3 5 6 7 8 9 10 11 ...
## $ Fold10: int [1:151] 1 2 3 4 5 7 8 9 10 11 ...
# Training-row indices of the first cross-validation fold.
fold1 <- cvSplit[[1L]]
length(x = fold1)
## [1] 150

# Subset the training data down to the rows used in that fold.
cvPredictors1 <- trainPredictors[fold1, ]
cvClasses1 <- trainClasses[fold1]

# Compare the full training-set size with the fold's training size.
nrow(x = trainPredictors)
## [1] 167
nrow(x = cvPredictors1)
## [1] 150
# Load the German credit data set and list its columns.
data("GermanCredit")
GermanCredit %>% glimpse()
## Rows: 1,000
## Columns: 62
## $ Duration <int> 6, 48, 12, 42, 24, 36, 24, 36, …
## $ Amount <int> 1169, 5951, 2096, 7882, 4870, 9…
## $ InstallmentRatePercentage <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3…
## $ ResidenceDuration <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1…
## $ Age <int> 67, 22, 49, 45, 53, 35, 53, 35,…
## $ NumberExistingCredits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…
## $ NumberPeopleMaintenance <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1…
## $ Telephone <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1…
## $ ForeignWorker <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Class <fct> Good, Bad, Good, Good, Bad, Goo…
## $ CheckingAccountStatus.lt.0 <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.0.to.200 <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ CheckingAccountStatus.gt.200 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.none <dbl> 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0…
## $ CreditHistory.NoCredit.AllPaid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.ThisBank.AllPaid <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.PaidDuly <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1…
## $ CreditHistory.Delay <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.Critical <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Purpose.NewCar <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1…
## $ Purpose.UsedCar <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ Purpose.Furniture.Equipment <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Purpose.Radio.Television <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Purpose.DomesticAppliance <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Repairs <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Education <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ Purpose.Vacation <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Retraining <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Business <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Other <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.lt.100 <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1…
## $ SavingsAccountBonds.100.to.500 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.500.to.1000 <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ SavingsAccountBonds.gt.1000 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ SavingsAccountBonds.Unknown <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ EmploymentDuration.lt.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ EmploymentDuration.1.to.4 <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0…
## $ EmploymentDuration.4.to.7 <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0…
## $ EmploymentDuration.gt.7 <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ EmploymentDuration.Unemployed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Male.Divorced.Seperated <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Personal.Female.NotSingle <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ Personal.Male.Single <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0…
## $ Personal.Male.Married.Widowed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Female.Single <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.None <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1…
## $ OtherDebtorsGuarantors.CoApplicant <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.Guarantor <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ Property.RealEstate <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Property.Insurance <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Property.CarOther <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ Property.Unknown <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Bank <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Stores <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.None <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Housing.Rent <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ Housing.Own <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0…
## $ Housing.ForFree <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0…
## $ Job.UnemployedUnskilled <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Job.UnskilledResident <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0…
## $ Job.SkilledEmployee <dbl> 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1…
## $ Job.Management.SelfEmp.HighlyQualified <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0…
# Prepare the German credit data: drop uninformative and redundant columns,
# then split into training (about 80%) and test sets.
set.seed(1056)

# Remove near-zero-variance columns. Guard the empty case: when nearZeroVar()
# flags nothing it returns integer(0), and `df[, -integer(0)]` would select
# zero columns instead of keeping them all. `drop = FALSE` keeps a data frame
# even if only one column survives.
nzv_cols <- nearZeroVar(GermanCredit)
if (length(nzv_cols) > 0) {
  GermanCredit <- GermanCredit[, -nzv_cols, drop = FALSE]
}

# Drop one indicator column from several dummy-variable groups.
# NOTE(review): presumably this removes perfectly collinear dummy sets --
# confirm. Names that are already absent are simply ignored, as with the
# original `$<- NULL` assignments.
redundant_cols <- c(
  "CheckingAccountStatus.lt.0",
  "SavingsAccountBonds.lt.100",
  "EmploymentDuration.lt.1",
  "EmploymentDuration.Unemployed",
  "Personal.Male.Married.Widowed",
  "Property.Unknown",
  "Housing.ForFree"
)
GermanCredit <- GermanCredit[
  , setdiff(names(GermanCredit), redundant_cols),
  drop = FALSE
]

# createDataPartition() returns a list by default; take its single element to
# get the training-row index vector.
inTrain <- createDataPartition(GermanCredit$Class, p = 0.8)[[1]]
GermanCreditTrain <- GermanCredit[inTrain, ]
GermanCreditTest <- GermanCredit[-inTrain, ]
library(kernlab)
##
## 다음의 패키지를 부착합니다: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# `Class ~ .` puts every remaining column in as a predictor
# (the last line is sure to be on the exam).
#
# Tune a radial-basis SVM over 10 candidate cost values with cross-validation.
# Predictors are centered and scaled first, and `classProbs = TRUE` makes class
# probabilities available to predict(type = "prob") later.
#
# `repeats` was removed from trainControl(): it has no meaning for
# method = "cv" and only produced a warning. If repeated cross-validation is
# wanted, use method = "repeatedcv" together with `repeats`.
svmFit <- train(
  Class ~ .,
  data = GermanCreditTrain,
  method = "svmRadial",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = trainControl(
    method = "cv",
    classProbs = TRUE
  )
)
svmFit
## Support Vector Machines with Radial Basis Function Kernel
##
## 800 samples
## 41 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (41), scaled (41)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.25 0.75625 0.3858886
## 0.50 0.76500 0.4087530
## 1.00 0.77000 0.3961280
## 2.00 0.76375 0.3782815
## 4.00 0.76500 0.3688453
## 8.00 0.77000 0.3892362
## 16.00 0.75125 0.3151667
## 32.00 0.73500 0.2797416
## 64.00 0.73250 0.2770492
## 128.00 0.72875 0.2713074
##
## Tuning parameter 'sigma' was held constant at a value of 0.0138183
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.0138183 and C = 1.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## 다음의 패키지를 부착합니다: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
# Tune a random forest (despite the `svmFit1` name, this is method = "rf")
# over 10 candidate `mtry` values with cross-validation, using the same
# formula, pre-processing and resampling settings as the SVM fit.
#
# `repeats` was removed from trainControl(): it has no meaning for
# method = "cv" and only produced a warning. If repeated cross-validation is
# wanted, use method = "repeatedcv" together with `repeats`.
svmFit1 <- train(
  Class ~ .,
  data = GermanCreditTrain,
  method = "rf",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = trainControl(
    method = "cv",
    classProbs = TRUE
  )
)
svmFit1
## Random Forest
##
## 800 samples
## 41 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (41), scaled (41)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.73500 0.1888907
## 6 0.76375 0.3571717
## 10 0.77000 0.3815096
## 15 0.76500 0.3785229
## 19 0.76750 0.3918055
## 23 0.76500 0.3886457
## 28 0.76750 0.3954219
## 32 0.77000 0.4021965
## 36 0.76250 0.3870212
## 41 0.76625 0.3958864
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.
library(randomForest)
# Tune a k-nearest-neighbours classifier (despite the `svmFit2` name, this is
# method = "knn") over 10 candidate `k` values with cross-validation, using the
# same formula, pre-processing and resampling settings as the SVM fit.
#
# `repeats` was removed from trainControl(): it has no meaning for
# method = "cv" and only produced a warning. If repeated cross-validation is
# wanted, use method = "repeatedcv" together with `repeats`.
svmFit2 <- train(
  Class ~ .,
  data = GermanCreditTrain,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = trainControl(
    method = "cv",
    classProbs = TRUE
  )
)
svmFit2
## k-Nearest Neighbors
##
## 800 samples
## 41 predictor
## 2 classes: 'Bad', 'Good'
##
## Pre-processing: centered (41), scaled (41)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.71250 0.2453952
## 7 0.71625 0.2389015
## 9 0.72250 0.2360241
## 11 0.72250 0.2345050
## 13 0.72875 0.2389910
## 15 0.73500 0.2446996
## 17 0.74875 0.2797479
## 19 0.73750 0.2285858
## 21 0.73375 0.2148876
## 23 0.73500 0.2137900
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.
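# In the recorded resampling results, SVM reaches the highest accuracy for this
# data (0.770 at C = 1), tied with random forest (0.770 at mtry = 10); kNN
# peaks lower (0.749 at k = 17).
# The tuneLength argument controls how many candidate tuning-parameter values
# are evaluated. For PLS, for example, train() uses the integer sequence from 1
# to tuneLength, so setting tuneLength = 15 evaluates every integer from 1 to 15.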
# Resampling profile of the SVM fit, with the tuning-parameter axis on a
# log2 scale.
plot(
  x = svmFit,
  scales = list(x = list(log = 2))
)

# Same plot for the random forest fit.
plot(
  x = svmFit1,
  scales = list(x = list(log = 2))
)

# Same plot for the k-nearest-neighbours fit.
plot(
  x = svmFit2,
  scales = list(x = list(log = 2))
)

# Predicted class labels for the test set from the SVM fit.
predicted <- predict(object = svmFit, newdata = GermanCreditTest)

# Use the `type` argument to get per-class probabilities instead of labels.
predictedProbs <- predict(
  object = svmFit,
  newdata = GermanCreditTest,
  type = "prob"
)
head(x = predictedProbs)
## Bad Good
## 1 0.4238216 0.5761784
## 2 0.0730835 0.9269165
## 3 0.5650184 0.4349816
## 4 0.6932819 0.3067181
## 5 0.1754360 0.8245640
## 6 0.3223266 0.6776734
# Per-class test-set probabilities from the random forest fit (`svmFit1`).
predictedProbs1 <- predict(
  object = svmFit1,
  newdata = GermanCreditTest,
  type = "prob"
)
head(x = predictedProbs1)
## Bad Good
## 4 0.604 0.396
## 7 0.054 0.946
## 16 0.480 0.520
## 19 0.738 0.262
## 22 0.186 0.814
## 33 0.340 0.660
# Per-class test-set probabilities from the k-nearest-neighbours fit (`svmFit2`).
predictedProbs2 <- predict(
  object = svmFit2,
  newdata = GermanCreditTest,
  type = "prob"
)
head(x = predictedProbs2)
## Bad Good
## 1 0.41176471 0.5882353
## 2 0.05882353 0.9411765
## 3 0.35294118 0.6470588
## 4 0.41176471 0.5882353
## 5 0.23529412 0.7647059
## 6 0.35294118 0.6470588