230110_4th week1

# Remove every object from the workspace so the session starts empty.
rm(list=ls())
ls() # lists the names of all objects currently defined in the workspace

## character(0)

getwd()

## [1] "C:/data"

library(dplyr)

## Warning: 패키지 'dplyr'는 R 버전 4.2.2에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(caret)

## Warning: 패키지 'caret'는 R 버전 4.2.2에서 작성되었습니다

## 필요한 패키지를 로딩중입니다: ggplot2

## 필요한 패키지를 로딩중입니다: lattice

library(recipes)

## Warning: 패키지 'recipes'는 R 버전 4.2.2에서 작성되었습니다

## 
## 다음의 패키지를 부착합니다: 'recipes'

## The following object is masked from 'package:stats':
## 
##     step

library(ggplot2)

library(mlbench)

## Warning: 패키지 'mlbench'는 R 버전 4.2.2에서 작성되었습니다

data(Glass)
glimpse(Glass)

## Rows: 214
## Columns: 10
## $ RI   <dbl> 1.52101, 1.51761, 1.51618, 1.51766, 1.51742, 1.51596, 1.51743, 1.…
## $ Na   <dbl> 13.64, 13.89, 13.53, 13.21, 13.27, 12.79, 13.30, 13.15, 14.04, 13…
## $ Mg   <dbl> 4.49, 3.60, 3.55, 3.69, 3.62, 3.61, 3.60, 3.61, 3.58, 3.60, 3.46,…
## $ Al   <dbl> 1.10, 1.36, 1.54, 1.29, 1.24, 1.62, 1.14, 1.05, 1.37, 1.36, 1.56,…
## $ Si   <dbl> 71.78, 72.73, 72.99, 72.61, 73.08, 72.97, 73.09, 73.24, 72.08, 72…
## $ K    <dbl> 0.06, 0.48, 0.39, 0.57, 0.55, 0.64, 0.58, 0.57, 0.56, 0.57, 0.67,…
## $ Ca   <dbl> 8.75, 7.83, 7.78, 8.22, 8.07, 8.07, 8.17, 8.24, 8.30, 8.40, 8.09,…
## $ Ba   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ Fe   <dbl> 0.00, 0.00, 0.00, 0.00, 0.00, 0.26, 0.00, 0.00, 0.00, 0.11, 0.24,…
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…

table(Glass$Type)

## 
##  1  2  3  5  6  7 
## 70 76 17 13  9 29

# Scatterplot matrix of all ten columns, class label included.
pairs(x = Glass)

# Same plot with the tenth column (Type, the class label) left out.
pairs(x = Glass[, -10])

# Correlation matrix of the nine remaining numeric columns.
cor(x = Glass[-10])

##               RI          Na           Mg          Al          Si            K
## RI  1.0000000000 -0.19188538 -0.122274039 -0.40732603 -0.54205220 -0.289832711
## Na -0.1918853790  1.00000000 -0.273731961  0.15679367 -0.06980881 -0.266086504
## Mg -0.1222740393 -0.27373196  1.000000000 -0.48179851 -0.16592672  0.005395667
## Al -0.4073260341  0.15679367 -0.481798509  1.00000000 -0.00552372  0.325958446
## Si -0.5420521997 -0.06980881 -0.165926723 -0.00552372  1.00000000 -0.193330854
## K  -0.2898327111 -0.26608650  0.005395667  0.32595845 -0.19333085  1.000000000
## Ca  0.8104026963 -0.27544249 -0.443750026 -0.25959201 -0.20873215 -0.317836155
## Ba -0.0003860189  0.32660288 -0.492262118  0.47940390 -0.10215131 -0.042618059
## Fe  0.1430096093 -0.24134641  0.083059529 -0.07440215 -0.09420073 -0.007719049
##            Ca            Ba           Fe
## RI  0.8104027 -0.0003860189  0.143009609
## Na -0.2754425  0.3266028795 -0.241346411
## Mg -0.4437500 -0.4922621178  0.083059529
## Al -0.2595920  0.4794039017 -0.074402151
## Si -0.2087322 -0.1021513105 -0.094200731
## K  -0.3178362 -0.0426180594 -0.007719049
## Ca  1.0000000 -0.1128409671  0.124968219
## Ba -0.1128410  1.0000000000 -0.058691755
## Fe  0.1249682 -0.0586917554  1.000000000

# Split the graphics device into 1 row x 2 columns so the two boxplots
# below are drawn side by side.
par(mfrow=c(1,2))
# Distribution of Mg, then of Al, within each glass Type.
boxplot(Glass$Mg ~ Glass$Type)
boxplot(Glass$Al ~ Glass$Type)

library(corrplot)

## Warning: 패키지 'corrplot'는 R 버전 4.2.2에서 작성되었습니다

## corrplot 0.92 loaded

# NOTE(review): the device is split into two panels here, but only one plot
# follows in this section -- confirm whether the 1x2 layout is intended.
par(mfrow=c(1,2))
# Correlation plot of the nine predictors (Type, column 10, excluded),
# with variables ordered via order="hclust".
corrplot(cor(Glass[,-10]), order="hclust")

# skewness
# Warm-up: mean of each of the four numeric iris measurements.
vapply(iris[1:4], mean, numeric(1))

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width 
##     5.843333     3.057333     3.758000     1.199333

library(e1071)

## Warning: 패키지 'e1071'는 R 버전 4.2.2에서 작성되었습니다

# Skewness of each of the nine Glass predictors (Type, column 10, excluded).
vapply(Glass[, -10], skewness, numeric(1))

##         RI         Na         Mg         Al         Si          K         Ca 
##  1.6027151  0.4478343 -1.1364523  0.8946104 -0.7202392  6.4600889  2.0184463 
##         Ba         Fe 
##  3.3686800  1.7298107

# Estimate a preprocessing pipeline on the nine predictors: Box-Cox
# transformation followed by centering and scaling.
trans<-preProcess(Glass[,-10],method=c("BoxCox","center","scale")) # center = subtract the column mean; scale = divide by the column SD (this standardises, it does NOT rescale to the 0-1 range)
trans

## Created from 214 samples and 9 variables
## 
## Pre-processing:
##   - Box-Cox transformation (5)
##   - centered (9)
##   - ignored (0)
##   - scaled (9)
## 
## Lambda estimates for Box-Cox transformation:
## -2, -0.1, 0.5, 2, -1.1

# Apply the fitted preprocessing to the predictors and preview the first
# five transformed columns.
transformed <- predict(trans, newdata = Glass[, -10])
head(transformed[, c("RI", "Na", "Mg", "Al", "Si")])

##           RI         Na        Mg          Al          Si
## 1  0.8756898  0.3133883 1.2517037 -0.65520274 -1.12729016
## 2 -0.2471367  0.6129977 0.6346799 -0.08726137  0.09719851
## 3 -0.7216425  0.1798164 0.6000157  0.27454124  0.43512776
## 4 -0.2305698 -0.2150217 0.6970756 -0.23439154 -0.05836211
## 5 -0.3101056 -0.1402661 0.6485456 -0.34194384  0.55238422
## 6 -0.7947626 -0.7480171 0.6416128  0.42852325  0.40909039

# Same pipeline with a PCA step appended: the output columns are principal
# component scores (PC1, PC2, ...) instead of the original predictors.
trans1 <- preProcess(
  Glass[, -10],
  method = c("BoxCox", "center", "scale", "pca")
)
transformed1 <- predict(trans1, newdata = Glass[, -10])
head(transformed1[, paste0("PC", 1:5)])

##          PC1        PC2        PC3        PC4        PC5
## 1 -1.2126444 -0.3942139 -0.1730756 -1.7193852  0.1913387
## 2  0.6179073 -0.7020476 -0.5507034 -0.8575350  0.1566312
## 3  0.9907027 -0.8876886 -0.6452946 -0.3027716  0.1363025
## 4  0.1510212 -0.9042336 -0.1622361 -0.4521567  0.4291846
## 5  0.3582849 -1.0160965 -0.5763959 -0.1667831  0.3634192
## 6  0.3408017 -1.3565637  0.7451275  1.0568333 -1.7762845

# Reproducibly blank out `crim` in 10 randomly chosen rows to simulate
# missing data for the imputation example below.
set.seed(777)
library(mlbench)
data("BostonHousing")
# seq_len() instead of 1:nrow(): safe even for a zero-row data frame, and it
# draws exactly the same rows for the same seed.
BostonHousing[sample(seq_len(nrow(BostonHousing)), 10), "crim"] <- NA
# Demonstration draw of 6 values from 1..46. Kept on purpose: it advances the
# RNG stream, so removing it would change the resampling results further down.
sample(1:46, 6)

## [1]  9 16  4 44 26 23

sample(1:46, 6)

## [1] 22 12 39  8 10 14

colSums(is.na(BostonHousing))

##    crim      zn   indus    chas     nox      rm     age     dis     rad     tax 
##      10       0       0       0       0       0       0       0       0       0 
## ptratio       b   lstat    medv 
##       0       0       0       0

# Response: medv. Predictors: the first five columns (crim, zn, indus, chas,
# nox); crim carries the NAs injected above.
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, seq_len(5)]
# Random forest with median imputation requested through caret's preProcess.
model <- caret::train(x = X, y = Y, method = "rf", preProcess = "medianImpute")
model

## Random Forest 
## 
## 506 samples
##   5 predictor
## 
## Pre-processing: median imputation (4), ignore (1) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 506, 506, 506, 506, 506, 506, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE     
##   2     6.188803  0.5601644  4.187659
##   3     6.295054  0.5488742  4.244309
##   5     6.584417  0.5148418  4.427385
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 2.

# Reload a clean copy of the data, then mark every crim value above 0.5 as
# missing.
set.seed(777)
data(BostonHousing)
BostonHousing$crim[BostonHousing$crim > 0.5] <- NA
Y <- BostonHousing[["medv"]]
X <- BostonHousing[, c(1, 2, 3, 5)]

# GLM with median imputation; print the smallest resampled RMSE.
model_median <- caret::train(
  x = X, y = Y,
  method = "glm", preProcess = "medianImpute"
)
print(min(model_median$results$RMSE))

## [1] 7.870132

rm(list=ls())

library(AppliedPredictiveModeling)

## Warning: 패키지 'AppliedPredictiveModeling'는 R 버전 4.2.2에서 작성되었습니다

data(twoClassData)
head(predictors)

##   PredictorA PredictorB
## 1     0.1582     0.1609
## 2     0.6552     0.4918
## 3     0.7060     0.6333
## 4     0.1992     0.0881
## 5     0.3952     0.4152
## 6     0.4250     0.2988

glimpse(predictors)

## Rows: 208
## Columns: 2
## $ PredictorA <dbl> 0.1582, 0.6552, 0.7060, 0.1992, 0.3952, 0.4250, 0.0658, 0.3…
## $ PredictorB <dbl> 0.1609, 0.4918, 0.6333, 0.0881, 0.4152, 0.2988, 0.1786, 0.2…

class(predictors)

## [1] "data.frame"

glimpse(classes)

##  Factor w/ 2 levels "Class1","Class2": 2 2 2 2 2 2 2 2 2 2 ...

set.seed(1)

# 80/20 split on the class labels: these are the row indices of the training
# portion; the rows left out form the test set.
traingRows <- createDataPartition(y = classes, p = 0.8, list = FALSE)
head(traingRows)

##      Resample1
## [1,]         1
## [2,]         2
## [3,]         3
## [4,]         7
## [5,]         8
## [6,]         9

# Predictors: rows picked by the partition go to training, the rest to test.
trainPredictors <- predictors[traingRows, ]
testPredictors <- predictors[-traingRows, ]
# Matching class labels for each portion.
trainClasses <- classes[traingRows]
testClasses <- classes[-traingRows]

str(trainPredictors)

## 'data.frame':    167 obs. of  2 variables:
##  $ PredictorA: num  0.1582 0.6552 0.706 0.0658 0.3086 ...
##  $ PredictorB: num  0.161 0.492 0.633 0.179 0.28 ...

str(testPredictors)

## 'data.frame':    41 obs. of  2 variables:
##  $ PredictorA: num  0.1992 0.3952 0.425 0.0847 0.2909 ...
##  $ PredictorB: num  0.0881 0.4152 0.2988 0.0548 0.3021 ...

# Three separate 80% resamples of the training labels, returned as a list of
# index vectors.
repeatedSplit <- createDataPartition(y = trainClasses, times = 3, p = 0.8)
str(repeatedSplit)

## List of 3
##  $ Resample1: int [1:135] 1 2 3 4 5 6 8 9 10 11 ...
##  $ Resample2: int [1:135] 1 2 3 4 5 6 7 8 9 11 ...
##  $ Resample3: int [1:135] 1 2 3 4 5 6 9 10 11 14 ...

# k-fold cross-validation: ten folds; with returnTrain = TRUE each list
# element holds the TRAINING row indices for that fold.
set.seed(1)
cvSplit <- createFolds(y = trainClasses, k = 10, returnTrain = TRUE)
str(cvSplit)

## List of 10
##  $ Fold01: int [1:150] 1 2 4 5 6 7 8 10 11 13 ...
##  $ Fold02: int [1:150] 1 2 3 4 6 7 8 9 10 11 ...
##  $ Fold03: int [1:150] 1 3 4 5 6 7 8 9 10 11 ...
##  $ Fold04: int [1:150] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Fold05: int [1:150] 2 3 4 5 6 7 8 9 10 11 ...
##  $ Fold06: int [1:150] 1 2 3 4 5 6 7 8 9 11 ...
##  $ Fold07: int [1:150] 1 2 3 4 5 6 7 9 10 12 ...
##  $ Fold08: int [1:151] 1 2 3 4 5 6 8 9 10 11 ...
##  $ Fold09: int [1:151] 1 2 3 5 6 7 8 9 10 11 ...
##  $ Fold10: int [1:151] 1 2 3 4 5 7 8 9 10 11 ...

# Training-row indices of the first fold.
fold1 <- cvSplit[["Fold01"]]
length(fold1)

## [1] 150

# Restrict labels and predictors to the first fold's training rows.
cvClasses1 <- trainClasses[fold1]
cvPredictors1 <- trainPredictors[fold1, ]
nrow(trainPredictors)

## [1] 167

nrow(cvPredictors1)

## [1] 150

library(caret)

data("GermanCredit")
glimpse(GermanCredit)

## Rows: 1,000
## Columns: 62
## $ Duration                               <int> 6, 48, 12, 42, 24, 36, 24, 36, …
## $ Amount                                 <int> 1169, 5951, 2096, 7882, 4870, 9…
## $ InstallmentRatePercentage              <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3…
## $ ResidenceDuration                      <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1…
## $ Age                                    <int> 67, 22, 49, 45, 53, 35, 53, 35,…
## $ NumberExistingCredits                  <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1…
## $ NumberPeopleMaintenance                <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1…
## $ Telephone                              <dbl> 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1…
## $ ForeignWorker                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Class                                  <fct> Good, Bad, Good, Good, Bad, Goo…
## $ CheckingAccountStatus.lt.0             <dbl> 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.0.to.200         <dbl> 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ CheckingAccountStatus.gt.200           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CheckingAccountStatus.none             <dbl> 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0…
## $ CreditHistory.NoCredit.AllPaid         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.ThisBank.AllPaid         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.PaidDuly                 <dbl> 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1…
## $ CreditHistory.Delay                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0…
## $ CreditHistory.Critical                 <dbl> 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Purpose.NewCar                         <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1…
## $ Purpose.UsedCar                        <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0…
## $ Purpose.Furniture.Equipment            <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Purpose.Radio.Television               <dbl> 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Purpose.DomesticAppliance              <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Repairs                        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Education                      <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0…
## $ Purpose.Vacation                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Retraining                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Business                       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Purpose.Other                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.lt.100             <dbl> 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1…
## $ SavingsAccountBonds.100.to.500         <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ SavingsAccountBonds.500.to.1000        <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ SavingsAccountBonds.gt.1000            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ SavingsAccountBonds.Unknown            <dbl> 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0…
## $ EmploymentDuration.lt.1                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ EmploymentDuration.1.to.4              <dbl> 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0…
## $ EmploymentDuration.4.to.7              <dbl> 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0…
## $ EmploymentDuration.gt.7                <dbl> 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0…
## $ EmploymentDuration.Unemployed          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Male.Divorced.Seperated       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Personal.Female.NotSingle              <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1…
## $ Personal.Male.Single                   <dbl> 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0…
## $ Personal.Male.Married.Widowed          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0…
## $ Personal.Female.Single                 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.None            <dbl> 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1…
## $ OtherDebtorsGuarantors.CoApplicant     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherDebtorsGuarantors.Guarantor       <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0…
## $ Property.RealEstate                    <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0…
## $ Property.Insurance                     <dbl> 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0…
## $ Property.CarOther                      <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1…
## $ Property.Unknown                       <dbl> 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Bank             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.Stores           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ OtherInstallmentPlans.None             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ Housing.Rent                           <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1…
## $ Housing.Own                            <dbl> 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0…
## $ Housing.ForFree                        <dbl> 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0…
## $ Job.UnemployedUnskilled                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ Job.UnskilledResident                  <dbl> 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0…
## $ Job.SkilledEmployee                    <dbl> 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1…
## $ Job.Management.SelfEmp.HighlyQualified <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0…

set.seed(1056)
table(GermanCredit$Class)

## 
##  Bad Good 
##  300  700

set.seed(1056)
# Drop near-zero-variance columns. Guard the empty case: when nearZeroVar()
# flags nothing it returns integer(0), and indexing with -integer(0) selects
# zero columns, which would silently wipe the whole data frame.
nzv_cols <- nearZeroVar(GermanCredit)
if (length(nzv_cols) > 0) {
  GermanCredit <- GermanCredit[, -nzv_cols, drop = FALSE]
}
# Remove selected dummy columns.
# NOTE(review): presumably done to avoid linearly dependent dummy sets -- confirm.
GermanCredit$CheckingAccountStatus.lt.0 <- NULL
GermanCredit$SavingsAccountBonds.lt.100 <- NULL
GermanCredit$EmploymentDuration.lt.1 <- NULL
GermanCredit$EmploymentDuration.Unemployed <- NULL
GermanCredit$Personal.Male.Married.Widowed <- NULL
GermanCredit$Property.Unknown <- NULL
GermanCredit$Housing.ForFree <- NULL
# 80/20 split on Class. [[1]] pulls the index vector out of the returned list;
# it selects the same rows as list = FALSE would (that form returns a
# one-column matrix rather than a plain vector).
inTrain <- createDataPartition(GermanCredit$Class, p = .8)[[1]]
GermanCreditTrain <- GermanCredit[ inTrain, ]
GermanCreditTest  <- GermanCredit[-inTrain, ]

library(kernlab)

## 
## 다음의 패키지를 부착합니다: 'kernlab'

## The following object is masked from 'package:ggplot2':
## 
##     alpha

# Radial-kernel SVM tuned with 10-fold cross-validation.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
svmFit <- train(Class ~ .,
                data = GermanCreditTrain,
                method = "svmRadial", # in practice rf and knn are usually enough
                preProcess = c("center", "scale"), # full name; `preProc` relied on partial matching
                tuneLength = 10, # size of the automatic tuning grid (10 candidate values of C)
                trControl = trainControl(method = "cv", # resampling scheme: k-fold cross-validation
                                         classProbs = TRUE)) # also compute class probabilities

svmFit

## Support Vector Machines with Radial Basis Function Kernel 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   C       Accuracy  Kappa    
##     0.25  0.75625   0.3858886
##     0.50  0.76500   0.4087530
##     1.00  0.77000   0.3961280
##     2.00  0.76375   0.3782815
##     4.00  0.76500   0.3688453
##     8.00  0.77000   0.3892362
##    16.00  0.75125   0.3151667
##    32.00  0.73500   0.2797416
##    64.00  0.73250   0.2770492
##   128.00  0.72875   0.2713074
## 
## Tuning parameter 'sigma' was held constant at a value of 0.0138183
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were sigma = 0.0138183 and C = 1.

# Random forest tuned with 10-fold cross-validation over 10 candidate mtry
# values.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
rfFit <- train(Class ~ .,
               data = GermanCreditTrain,
               method = "rf",
               preProcess = c("center", "scale"), # full name; `preProc` relied on partial matching
               tuneLength = 10,
               trControl = trainControl(method = "cv",
                                        classProbs = TRUE))

rfFit

## Random Forest 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy  Kappa    
##    2    0.73500   0.1888907
##    6    0.76375   0.3571717
##   10    0.77000   0.3815096
##   15    0.76500   0.3785229
##   19    0.76750   0.3918055
##   23    0.76500   0.3886457
##   28    0.76750   0.3954219
##   32    0.77000   0.4021965
##   36    0.76250   0.3870212
##   41    0.76625   0.3958864
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 10.

# k-nearest neighbours on centered and scaled predictors, tuned with 10-fold
# cross-validation over 10 candidate values of k.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
knnFit <- train(Class ~ .,
                data = GermanCreditTrain,
                method = "knn",
                preProcess = c("center", "scale"), # full name; `preProc` relied on partial matching
                tuneLength = 10,
                trControl = trainControl(method = "cv",
                                         classProbs = TRUE))

knnFit

## k-Nearest Neighbors 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa    
##    5  0.71250   0.2453952
##    7  0.71625   0.2389015
##    9  0.72250   0.2360241
##   11  0.72250   0.2345050
##   13  0.72875   0.2389910
##   15  0.73500   0.2446996
##   17  0.74875   0.2797479
##   19  0.73750   0.2285858
##   21  0.73375   0.2148876
##   23  0.73500   0.2137900
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 17.

# kNN with a Box-Cox step added before centering and scaling; 5 candidate
# values of k, 10-fold cross-validation.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
knnFit1 <- train(Class ~ .,
                 data = GermanCreditTrain,
                 method = "knn",
                 preProcess = c("BoxCox", "center", "scale"), # full name; `preProc` relied on partial matching
                 tuneLength = 5,
                 trControl = trainControl(method = "cv",
                                          classProbs = TRUE))

knnFit1

## k-Nearest Neighbors 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: Box-Cox transformation (6), centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa    
##    5  0.71875   0.2556639
##    7  0.71875   0.2354375
##    9  0.71875   0.2324534
##   11  0.72000   0.2388210
##   13  0.73125   0.2545857
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.

# Same Box-Cox + center + scale kNN as knnFit1, but with a wider grid of
# 10 candidate values of k; 10-fold cross-validation.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
knnFit2 <- train(Class ~ .,
                 data = GermanCreditTrain,
                 method = "knn",
                 preProcess = c("BoxCox", "center", "scale"), # full name; `preProc` relied on partial matching
                 tuneLength = 10,
                 trControl = trainControl(method = "cv",
                                          classProbs = TRUE))

knnFit2

## k-Nearest Neighbors 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: Box-Cox transformation (6), centered (41), scaled (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa    
##    5  0.72000   0.2741181
##    7  0.72875   0.2657760
##    9  0.71875   0.2270198
##   11  0.73000   0.2482702
##   13  0.73125   0.2533382
##   15  0.73625   0.2444686
##   17  0.73375   0.2360879
##   19  0.72750   0.2109112
##   21  0.73375   0.2235829
##   23  0.73500   0.2218212
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 15.

# kNN on principal components: Box-Cox, centering, scaling, then PCA;
# 5 candidate values of k, 10-fold cross-validation.
# No `repeats` argument is passed to trainControl(): it has no meaning for
# method = "cv" and only triggers a warning. Switch to method = "repeatedcv"
# (with repeats = 5) if repeated cross-validation is actually wanted.
knnFit3 <- train(Class ~ .,
                 data = GermanCreditTrain,
                 method = "knn",
                 preProcess = c("BoxCox", "center", "scale", "pca"), # full name; `preProc` relied on partial matching
                 tuneLength = 5,
                 trControl = trainControl(method = "cv",
                                          classProbs = TRUE))

knnFit3

## k-Nearest Neighbors 
## 
## 800 samples
##  41 predictor
##   2 classes: 'Bad', 'Good' 
## 
## Pre-processing: Box-Cox transformation (6), centered (41), scaled
##  (41), principal component signal extraction (41) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 720, 720, 720, 720, 720, 720, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy  Kappa    
##    5  0.71000   0.2436294
##    7  0.71875   0.2511040
##    9  0.71375   0.2224824
##   11  0.72375   0.2446006
##   13  0.73375   0.2610622
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 13.

# Resampled accuracy against the cost parameter C, with the x-axis on a log2
# scale. The scale component must be spelled `log`; a misspelled name such as
# `llog` is rejected with a warning and the axis stays linear.
plot(svmFit, scales = list(x = list(log = 2)))

# Hard class predictions and per-class probabilities for the held-out test
# set. `type` is spelled out in full rather than relying on partial argument
# matching (`typ`).
predicted <- predict(svmFit, newdata = GermanCreditTest)
predictedProbs <- predict(svmFit, newdata = GermanCreditTest, type = "prob")

head(predicted)

## [1] Good Good Bad  Bad  Good Good
## Levels: Bad Good

head(predictedProbs)

##         Bad      Good
## 1 0.4238216 0.5761784
## 2 0.0730835 0.9269165
## 3 0.5650184 0.4349816
## 4 0.6932819 0.3067181
## 5 0.1754360 0.8245640
## 6 0.3223266 0.6776734

230110_4th week1

이동건

2023-01-10