# bigdata_07-1

library(AppliedPredictiveModeling)
library(dplyr)

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## 필요한 패키지를 로딩중입니다: ggplot2

## 필요한 패키지를 로딩중입니다: lattice

# Load the two-class example data set. The rest of this section reads the
# `predictors` and `classes` objects, so they are expected to exist after
# this call.
data(twoClassData)
# Show the first rows of the predictor table.
head(predictors)

##   PredictorA PredictorB
## 1     0.1582     0.1609
## 2     0.6552     0.4918
## 3     0.7060     0.6333
## 4     0.1992     0.0881
## 5     0.3952     0.4152
## 6     0.4250     0.2988

# Confirm the container type of the predictors.
class(predictors)

## [1] "data.frame"

# Compact column-wise overview of the predictors.
glimpse(predictors)

## Rows: 208
## Columns: 2
## $ PredictorA <dbl> 0.1582, 0.6552, 0.7060, 0.1992, 0.3952, 0.4250, 0.0658, 0.3…
## $ PredictorB <dbl> 0.1609, 0.4918, 0.6333, 0.0881, 0.4152, 0.2988, 0.1786, 0.2…

# Number of class labels; it matches the 208 predictor rows reported above.
length(classes)

## [1] 208

# Fix the RNG seed so the split below is reproducible.
set.seed(1)
# Row indices for an 80% training sample. With `list = FALSE` the indices come
# back as a one-column matrix (see the printed head below) instead of a list.
traingRows <- createDataPartition(y = classes, p = 0.8, list = FALSE)
head(traingRows)

##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         7
## [5,]         8
## [6,]         9

# Predictors: selected rows go to training, the remaining rows to testing.
trainPredictors <- predictors[traingRows, ]
testPredictors <- predictors[-traingRows, ]
# Class labels, split with the same indices so they stay aligned with the rows.
trainClasses <- classes[traingRows]
testClasses <- classes[-traingRows]

# Structure of the training predictors.
str(trainPredictors)

## 'data.frame':    167 obs. of  2 variables:
##  $ PredictorA: num  0.1582 0.6552 0.706 0.0658 0.3086 ...
##  $ PredictorB: num  0.161 0.492 0.633 0.179 0.28 ...

# Structure of the held-out test predictors.
str(testPredictors)

## 'data.frame':    41 obs. of  2 variables:
##  $ PredictorA: num  0.1992 0.3952 0.425 0.0847 0.2909 ...
##  $ PredictorB: num  0.0881 0.4152 0.2988 0.0548 0.3021 ...

# Draw three 80% resamples of the training labels. With the default return
# type the result is a list holding one index vector per resample.
repeatedSplit <- createDataPartition(y = trainClasses, times = 3, p = 0.8)
str(repeatedSplit)

## List of 3
##  $ Resample1: int [1:135] 1 2 3 4 5 6 8 9 10 11 ...
##  $ Resample2: int [1:135] 1 2 3 4 5 6 7 8 9 11 ...
##  $ Resample3: int [1:135] 1 2 3 4 5 6 9 10 11 14 ...

# Reseed so the fold assignment is reproducible.
set.seed(1)
# Ten cross-validation folds over the training labels. `returnTrain = TRUE`
# makes each list element hold the rows kept for training in that fold
# (about 150 of the 167 rows), not the held-out rows.
cvSplit <- createFolds(y = trainClasses, k = 10, returnTrain = TRUE)
str(cvSplit)

## List of 10
##  $ Fold01: int [1:150] 1 2 4 5 6 7 8 10 11 13 ...
##  $ Fold02: int [1:150] 1 2 3 4 6 7 8 9 10 11 ...
##  $ Fold03: int [1:150] 1 3 4 5 6 7 8 9 10 11 ...
##  $ Fold04: int [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold05: int [1:150] 2 3 4 5 6 7 8 9 10 11 ...
##  $ Fold06: int [1:150] 1 2 3 4 5 6 7 8 9 11 ...
##  $ Fold07: int [1:150] 1 2 3 4 5 6 7 9 10 12 ...
##  $ Fold08: int [1:151] 1 2 3 4 5 6 8 9 10 11 ...
##  $ Fold09: int [1:151] 1 2 3 5 6 7 8 9 10 11 ...
##  $ Fold10: int [1:151] 1 2 3 4 5 7 8 9 10 11 ...

# Training-row indices of the first fold.
fold1 <- cvSplit[[1]]
length(fold1)

## [1] 150

# Subset predictors and labels with the same fold indices.
cvPredictors1 <- trainPredictors[fold1, ]
cvClasses1 <- trainClasses[fold1]
# Full training set size, for comparison with the fold subset below.
nrow(trainPredictors)

## [1] 167

nrow(cvPredictors1)

## [1] 150

# Load the GermanCredit data set and list its columns and their types.
data("GermanCredit")
glimpse(GermanCredit)

## Rows: 1,000
## Columns: 62
## $ Duration                               <int> 6, 48, 12, 42, 24, 36, 24, 36, …
## $ Amount                                 <int> 1169, 5951, 2096, 7882, 4870, 9…
## $ InstallmentRatePercentage              <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3…
## $ ResidenceDuration                      <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1…
## $ Age                                    <int> 67, 22, 49, 45, 53, 35, 53, 35,…
## $ NumberExistingCredits                  <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…
## $ NumberPeopleMaintenance                <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1…
## $ Telephone                              <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1…
## $ ForeignWorker                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Class                                  <fct> Good, Bad, Good, Good, Bad, Goo…
## $ CheckingAccountStatus.lt.0             <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.0.to.200         <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ CheckingAccountStatus.gt.200           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.none             <dbl> 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0…
## $ CreditHistory.NoCredit.AllPaid         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.ThisBank.AllPaid         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.PaidDuly                 <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1…
## $ CreditHistory.Delay                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.Critical                 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Purpose.NewCar                         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1…
## $ Purpose.UsedCar                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ Purpose.Furniture.Equipment            <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Purpose.Radio.Television               <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Purpose.DomesticAppliance              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Repairs                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Education                      <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ Purpose.Vacation                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Retraining                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Business                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Other                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.lt.100             <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1…
## $ SavingsAccountBonds.100.to.500         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.500.to.1000        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ SavingsAccountBonds.gt.1000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ SavingsAccountBonds.Unknown            <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ EmploymentDuration.lt.1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ EmploymentDuration.1.to.4              <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0…
## $ EmploymentDuration.4.to.7              <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0…
## $ EmploymentDuration.gt.7                <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ EmploymentDuration.Unemployed          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Male.Divorced.Seperated       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Personal.Female.NotSingle              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ Personal.Male.Single                   <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0…
## $ Personal.Male.Married.Widowed          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Female.Single                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.None            <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1…
## $ OtherDebtorsGuarantors.CoApplicant     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.Guarantor       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ Property.RealEstate                    <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Property.Insurance                     <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Property.CarOther                      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ Property.Unknown                       <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Bank             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Stores           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.None             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Housing.Rent                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ Housing.Own                            <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0…
## $ Housing.ForFree                        <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0…
## $ Job.UnemployedUnskilled                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Job.UnskilledResident                  <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0…
## $ Job.SkilledEmployee                    <dbl> 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1…
## $ Job.Management.SelfEmp.HighlyQualified <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0…

# Fix the RNG seed so the train/test partition below is reproducible.
set.seed(1056)

# Drop near-zero-variance columns. The length guard matters:
# `df[, -integer(0)]` selects zero columns, so without it every column would be
# discarded whenever nearZeroVar() finds nothing to remove.
nzvCols <- nearZeroVar(GermanCredit)
if (length(nzvCols) > 0) {
  GermanCredit <- GermanCredit[, -nzvCols, drop = FALSE]
}

# Remove one indicator column from each of several dummy-coded groups.
# NOTE(review): presumably this avoids perfectly collinear indicator sets;
# confirm against the course material.
redundantCols <- c(
  "CheckingAccountStatus.lt.0",
  "SavingsAccountBonds.lt.100",
  "EmploymentDuration.lt.1",
  "EmploymentDuration.Unemployed",
  "Personal.Male.Married.Widowed",
  "Property.Unknown",
  "Housing.ForFree"
)
# Names already removed above are simply not matched, mirroring the no-op
# behaviour of `df$col <- NULL` on a missing column.
keepCols <- !(names(GermanCredit) %in% redundantCols)
GermanCredit <- GermanCredit[, keepCols, drop = FALSE]

# 80/20 split on the outcome. createDataPartition() returns a list by default,
# so `[[1]]` extracts the single index vector.
inTrain <- createDataPartition(GermanCredit$Class, p = 0.8)[[1]]
GermanCreditTrain <- GermanCredit[inTrain, ]
GermanCreditTest <- GermanCredit[-inTrain, ]
library(kernlab)

## 
## 다음의 패키지를 부착합니다: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

# `Class ~ .` means every remaining column is used as a predictor.
# (Course note: the last argument, trControl, is sure to be on the exam.)
#
# `repeats` is intentionally not passed to trainControl(): it has no meaning
# for method = "cv" and only produced a warning. The resampling scheme is
# plain 10-fold cross-validation. For 5x repeated 10-fold CV, use
# method = "repeatedcv" together with repeats = 5 instead.
svmFit <- train(Class ~ .,
                data = GermanCreditTrain,
                method = "svmRadial",
                preProc = c("center", "scale"),
                tuneLength = 10,
                trControl = trainControl(method = "cv",
                                         classProbs = TRUE))

svmFit

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy  Kappa    
##     0.25  0.75625   0.3858886
##     0.50  0.76500   0.4087530
##     1.00  0.77000   0.3961280
##     2.00  0.76375   0.3782815
##     4.00  0.76500   0.3688453
##     8.00  0.77000   0.3892362
##    16.00  0.75125   0.3151667
##    32.00  0.73500   0.2797416
##    64.00  0.73250   0.2770492
##   128.00  0.72875   0.2713074
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0138183
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.0138183 and C = 1.

library(randomForest)

## randomForest 4.7-1.1

## Type rfNews() to see new features/changes/bug fixes.

## 
## 다음의 패키지를 부착합니다: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

## The following object is masked from 'package:dplyr':
## 
##     combine

# Random forest fit on the same training data and resampling setup.
# NOTE: despite the `svm` prefix in its name, `svmFit1` holds the random
# forest model; the name is kept because later statements refer to it.
#
# `repeats` is intentionally not passed to trainControl(): it has no meaning
# for method = "cv" and only produced a warning. For 5x repeated 10-fold CV,
# use method = "repeatedcv" together with repeats = 5 instead.
svmFit1 <- train(Class ~ .,
                 data = GermanCreditTrain,
                 method = "rf",
                 preProc = c("center", "scale"),
                 tuneLength = 10,
                 trControl = trainControl(method = "cv",
                                          classProbs = TRUE))

svmFit1

## Random Forest 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa    
##    2    0.73500   0.1888907
##    6    0.76375   0.3571717
##   10    0.77000   0.3815096
##   15    0.76500   0.3785229
##   19    0.76750   0.3918055
##   23    0.76500   0.3886457
##   28    0.76750   0.3954219
##   32    0.77000   0.4021965
##   36    0.76250   0.3870212
##   41    0.76625   0.3958864
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

# k-nearest-neighbours fit on the same training data and resampling setup.
# NOTE: despite the `svm` prefix in its name, `svmFit2` holds the kNN model;
# the name is kept because later statements refer to it.
#
# `repeats` is intentionally not passed to trainControl(): it has no meaning
# for method = "cv" and only produced a warning. For 5x repeated 10-fold CV,
# use method = "repeatedcv" together with repeats = 5 instead.
svmFit2 <- train(Class ~ .,
                 data = GermanCreditTrain,
                 method = "knn",
                 preProc = c("center", "scale"),
                 tuneLength = 10,
                 trControl = trainControl(method = "cv",
                                          classProbs = TRUE))

svmFit2

## k-Nearest Neighbors 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa    
##    5  0.71250   0.2453952
##    7  0.71625   0.2389015
##    9  0.72250   0.2360241
##   11  0.72250   0.2345050
##   13  0.72875   0.2389910
##   15  0.73500   0.2446996
##   17  0.74875   0.2797479
##   19  0.73750   0.2285858
##   21  0.73375   0.2148876
##   23  0.73500   0.2137900
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.

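# On this data the SVM (C = 1) and the random forest (mtry = 10) tie for the
# best cross-validated accuracy (0.770); kNN peaks lower at 0.749 (k = 17).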

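# The tuneLength= option controls how many candidate tuning values are
# evaluated. For PLS, for example, train() uses the integer sequence from 1 to
# tuneLength, so setting tuneLength = 15 evaluates every integer from 1 to 15.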

# Plot each fitted model's tuning results with a base-2 log x-axis.
# SVM: cost parameter C.
plot(svmFit, scales = list(x = list(log = 2)))

# Random forest: mtry.
plot(svmFit1, scales = list(x = list(log = 2)))

# kNN: number of neighbours k.
plot(svmFit2, scales = list(x = list(log = 2)))

# Predictions for the held-out rows using the default prediction type.
predicted <- predict(svmFit, newdata = GermanCreditTest)
# Use the `type` argument to obtain per-class probabilities instead.
predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob")
head(predictedProbs)

##         Bad      Good
## 1 0.4238216 0.5761784
## 2 0.0730835 0.9269165
## 3 0.5650184 0.4349816
## 4 0.6932819 0.3067181
## 5 0.1754360 0.8245640
## 6 0.3223266 0.6776734

# Per-class probabilities on the test set from the random forest fit.
predictedProbs1 <- predict(svmFit1, newdata = GermanCreditTest, type = "prob")
head(predictedProbs1)

##      Bad  Good
## 4  0.604 0.396
## 7  0.054 0.946
## 16 0.480 0.520
## 19 0.738 0.262
## 22 0.186 0.814
## 33 0.340 0.660

# Per-class probabilities on the test set from the kNN fit.
predictedProbs2 <- predict(svmFit2, newdata = GermanCreditTest, type = "prob")
head(predictedProbs2)

##          Bad      Good
## 1 0.41176471 0.5882353
## 2 0.05882353 0.9411765
## 3 0.35294118 0.6470588
## 4 0.41176471 0.5882353
## 5 0.23529412 0.7647059
## 6 0.35294118 0.6470588

# bigdata_07-1
#
# Noh Hyeon Uk
#
# 2023-01-10