1. ML theo phân loại dữ liệu

cách lấy mẫu:

# Define the training control
#fitControl <- trainControl(
#    method = 'cv',                   # k-fold cross validation
#    number = 5,                      # number of folds
#    savePredictions = 'final',       # saves predictions for optimal tuning parameter
#    classProbs = TRUE,               # should class probabilities be returned
#    summaryFunction=twoClassSummary  # results summary function (two-class outcomes only)
#) 

1.1 Phân loại thuật toán knn với cách lấy mẫu cv

# Take two copies of the built-in iris data: `dulieu` is the working data
# frame used for modelling, `dulieu1` is kept aside for the missing-value
# section later in the script.
dulieu1 <- iris
dulieu <- dulieu1
head(dulieu, n = 6L)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa
# Show the number of rows and columns of the working data frame.
dim(dulieu)
## [1] 150   5
# Give the five columns generic names bienA..bienE, in their existing order
# (the last column, bienE, holds the class labels).
names(dulieu) <- paste0("bien", LETTERS[1:5])
head(dulieu)
##   bienA bienB bienC bienD  bienE
## 1   5.1   3.5   1.4   0.2 setosa
## 2   4.9   3.0   1.4   0.2 setosa
## 3   4.7   3.2   1.3   0.2 setosa
## 4   4.6   3.1   1.5   0.2 setosa
## 5   5.0   3.6   1.4   0.2 setosa
## 6   5.4   3.9   1.7   0.4 setosa
# caret supplies createDataPartition(), trainControl(), train(),
# confusionMatrix() and preProcess() used throughout this script. Attach it
# at first use so the visible code does not rely on an unseen setup step.
library(caret)

# Keep about 80% of the rows for training and hold out the rest for testing.
# The split is balanced on bienA (the numeric response of the regression
# section), not on the class column bienE.
# NOTE: no seed is set in the visible code, so the split differs between runs.
chiadl <- createDataPartition(dulieu$bienA, p = 0.8, list = FALSE)
traindl <- dulieu[chiadl, ]
testdl <- dulieu[-chiadl, ]

# Predictor columns (1 to 4) and class labels (column 5) of the training
# split, in the x/y form passed to train() below.
traindata <- traindl[, 1:4]
classdata <- traindl[, 5]

# Resampling scheme: k-fold cross-validation with caret's default number of
# folds (the model printout reports 10 folds).
huanluyen <- trainControl(method = "cv")

# k-nearest-neighbours classifier on the training split. Predictors are
# centred and scaled, and ten candidate values of k are compared.
knn1 <- train(
  x = traindata,
  y = classdata,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = huanluyen,
  tuneLength = 10
)

print(knn1)
## k-Nearest Neighbors 
## 
## 121 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: centered (4), scaled (4) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 110, 109, 109, 109, 108, 108, ... 
## Resampling results across tuning parameters:
## 
##   k   Accuracy   Kappa    
##    5  0.9596154  0.9392839
##    7  0.9596154  0.9392839
##    9  0.9512821  0.9267839
##   11  0.9679487  0.9517839
##   13  0.9596154  0.9392839
##   15  0.9505245  0.9253598
##   17  0.9421911  0.9128598
##   19  0.9185897  0.8773649
##   21  0.9102564  0.8648649
##   23  0.9025641  0.8527273
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
# Confusion matrix of the resampled (cross-validated) predictions of knn1.
confusionMatrix(knn1)
## Cross-Validated (10 fold) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##             Reference
## Prediction   setosa versicolor virginica
##   setosa       33.1        0.0       0.0
##   versicolor    0.0       33.9       1.7
##   virginica     0.0        1.7      29.8
##                             
##  Accuracy (average) : 0.9669
# Predict the class of every held-out row with the tuned kNN model, then
# compare the predictions with the true labels in column 5 of the test split.
uocluong1 <- predict(knn1, newdata = testdl)
confusionMatrix(data = uocluong1, reference = testdl[, 5])
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          7         1
##   virginica       0          0        11
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9655          
##                  95% CI : (0.8224, 0.9991)
##     No Information Rate : 0.4138          
##     P-Value [Acc > NIR] : 3.242e-10       
##                                           
##                   Kappa : 0.9476          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            1.0000           0.9167
## Specificity                 1.0000            0.9545           1.0000
## Pos Pred Value              1.0000            0.8750           1.0000
## Neg Pred Value              1.0000            1.0000           0.9444
## Prevalence                  0.3448            0.2414           0.4138
## Detection Rate              0.3448            0.2414           0.3793
## Detection Prevalence        0.3448            0.2759           0.3793
## Balanced Accuracy           1.0000            0.9773           0.9583

1.2 Phân loại theo Neural Network

# NOTE(review): nothing in the visible code appears to use MASS; caret's
# "nnet" method is backed by the nnet package. Confirm whether this call is
# still needed.
library(MASS)

# Single-hidden-layer neural network classifier on the training split.
# Predictors are rescaled to [0, 1]; two values per tuning parameter are
# tried. No trControl is given, so caret's default resampling is used (the
# printout reports 25 bootstrap repetitions). `maxit` and `trace` are
# forwarded to the underlying fitting function.
nnet1 <- train(
  x = traindata,
  y = classdata,
  method = "nnet",
  preProcess = "range",
  tuneLength = 2,
  maxit = 100,
  trace = FALSE
)

print(nnet1)
## Neural Network 
## 
## 121 samples
##   4 predictor
##   3 classes: 'setosa', 'versicolor', 'virginica' 
## 
## Pre-processing: re-scaling to [0, 1] (4) 
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 121, 121, 121, 121, 121, 121, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa    
##   1     0.0    0.8334068  0.7508701
##   1     0.1    0.9606782  0.9403029
##   3     0.0    0.9380184  0.9057234
##   3     0.1    0.9605004  0.9397472
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.1.
# Confusion matrix of the resampled (bootstrap) predictions of nnet1.
confusionMatrix(nnet1)
## Bootstrapped (25 reps) Confusion Matrix 
## 
## (entries are percentual average cell counts across resamples)
##  
##             Reference
## Prediction   setosa versicolor virginica
##   setosa       35.2        0.0       0.0
##   versicolor    0.0       32.2       0.6
##   virginica     0.0        3.3      28.6
##                             
##  Accuracy (average) : 0.9608
# Predict the class of every held-out row with the tuned network, then
# compare the predictions with the true labels in column 5 of the test split.
uocluong2 <- predict(nnet1, newdata = testdl)
confusionMatrix(data = uocluong2, reference = testdl[, 5])
## Confusion Matrix and Statistics
## 
##             Reference
## Prediction   setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          6         0
##   virginica       0          1        12
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9655          
##                  95% CI : (0.8224, 0.9991)
##     No Information Rate : 0.4138          
##     P-Value [Acc > NIR] : 3.242e-10       
##                                           
##                   Kappa : 0.9466          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: setosa Class: versicolor Class: virginica
## Sensitivity                 1.0000            0.8571           1.0000
## Specificity                 1.0000            1.0000           0.9412
## Pos Pred Value              1.0000            1.0000           0.9231
## Neg Pred Value              1.0000            0.9565           1.0000
## Prevalence                  0.3448            0.2414           0.4138
## Detection Rate              0.3448            0.2069           0.4138
## Detection Prevalence        0.3448            0.2069           0.4483
## Balanced Accuracy           1.0000            0.9286           0.9706

2 ML cho hồi quy

2.1 Hồi quy OLS

# Regression formula shared by the models in this section: bienA explained
# by bienB, bienC and bienD.
congthuc <- bienA ~ bienB + bienC + bienD

# Ordinary least squares fit on the full data set.
# NOTE(review): the fitted object is stored under the name `lm`, which masks
# stats::lm as a variable. Calls written as lm(...) still reach the function
# because R skips non-function bindings when resolving a call, but a distinct
# name would be clearer.
lm <- lm(formula = congthuc, data = dulieu)
summary(lm)
## 
## Call:
## lm(formula = congthuc, data = dulieu)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.82816 -0.21989  0.01875  0.19709  0.84570 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.85600    0.25078   7.401 9.85e-12 ***
## bienB        0.65084    0.06665   9.765  < 2e-16 ***
## bienC        0.70913    0.05672  12.502  < 2e-16 ***
## bienD       -0.55648    0.12755  -4.363 2.41e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared:  0.8586, Adjusted R-squared:  0.8557 
## F-statistic: 295.5 on 3 and 146 DF,  p-value: < 2.2e-16
# The same linear regression fitted through caret on the full data set.
# No trControl is given, so caret's default resampling is used (the printout
# reports 25 bootstrap repetitions).
lm1 <- train(congthuc, data = dulieu, method = "lm")
print(lm1)
## Linear Regression 
## 
## 150 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 150, 150, 150, 150, 150, 150, ... 
## Resampling results:
## 
##   RMSE       Rsquared   MAE      
##   0.3175866  0.8588323  0.2585811
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Tuning grid: candidate neighbour counts 1..10. The column is named after
# the tuning parameter itself (`k`); the legacy leading-dot spelling `.k` is
# not needed.
k <- data.frame(k = seq(1, 10, 1))

# k-nearest-neighbours regression of bienA on bienB, bienC and bienD, tuned
# over the grid above with 10-fold cross-validation on the training split.
# Despite the `lm` prefix in its name, lm2 is a kNN model, not a linear model.
lm2 <- train(
  congthuc,
  data = traindl,
  method = "knn",
  trControl = trainControl(method = "cv", number = 10),
  tuneGrid = k
)
lm2
## k-Nearest Neighbors 
## 
## 121 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 110, 110, 109, 109, 109, 108, ... 
## Resampling results across tuning parameters:
## 
##   k   RMSE       Rsquared   MAE      
##    1  0.4008668  0.7768369  0.3188589
##    2  0.3667971  0.8024508  0.2901566
##    3  0.3650434  0.8138022  0.2948610
##    4  0.3537024  0.8347009  0.2869587
##    5  0.3566483  0.8245378  0.2885367
##    6  0.3663488  0.8156912  0.2949729
##    7  0.3654594  0.8156979  0.2947473
##    8  0.3656054  0.8154683  0.2941686
##    9  0.3635110  0.8154435  0.2941193
##   10  0.3563023  0.8252836  0.2915722
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
# Predict bienA for the held-out rows; the first column (the observed bienA)
# is dropped from the data passed to the model.
predict(lm2, newdata = testdl[-1])
##  [1] 4.600000 4.816667 5.071429 5.300000 5.225000 4.966667 4.900000 4.600000
##  [9] 5.325000 5.375000 5.350000 6.540000 5.742857 5.980000 5.250000 5.650000
## [17] 6.620000 6.200000 7.325000 6.575000 6.400000 6.600000 6.200000 7.600000
## [25] 6.500000 6.100000 6.600000 6.500000 6.625000
library(rpart)

# Regression tree (CART) for the same formula, fitted on the full data set.
# Ten candidate values of the complexity parameter cp are compared; no
# trControl is given, so caret's default resampling is used.
rpar1 <- train(
  congthuc,
  data = dulieu,
  method = "rpart",
  tuneLength = 10
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
# Print the resampling results for each candidate cp and the selected value.
rpar1
## CART 
## 
## 150 samples
##   3 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 150, 150, 150, 150, 150, 150, ... 
## Resampling results across tuning parameters:
## 
##   cp           RMSE       Rsquared   MAE      
##   0.003265012  0.3922836  0.7737134  0.3133693
##   0.005721396  0.3941313  0.7716523  0.3133411
##   0.006922562  0.3930462  0.7721918  0.3121687
##   0.008358800  0.3962493  0.7688987  0.3145670
##   0.016980371  0.4161701  0.7471649  0.3300327
##   0.023031646  0.4265943  0.7338607  0.3378040
##   0.029804524  0.4282205  0.7308005  0.3374651
##   0.057188720  0.4533376  0.6952094  0.3619504
##   0.121807006  0.5126831  0.6169138  0.4126823
##   0.613462371  0.6254975  0.5601944  0.5104314
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.003265012.

2.2 Hồi quy logit

3 Xử lý dữ liệu rỗng

# Blank out the first three values of column 1 so there is something to impute.
dulieu1[seq_len(3), 1] <- NA

library(RANN)  # required for knnImpute

# Estimate a k-nearest-neighbour imputation model from the data, then apply
# it to the same data. The printed result shows the numeric columns on a
# standardised scale rather than in their original units.
dulieu2 <- preProcess(dulieu1, method = "knnImpute")
dulieu3 <- predict(dulieu2, newdata = dulieu1)
head(dulieu3)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1   -0.8031618  1.01560199    -1.335752   -1.311052  setosa
## 2   -1.4577304 -0.13153881    -1.335752   -1.311052  setosa
## 3   -1.3607573  0.32731751    -1.392399   -1.311052  setosa
## 4   -1.5304603  0.09788935    -1.279104   -1.311052  setosa
## 5   -1.0455946  1.24503015    -1.335752   -1.311052  setosa
## 6   -0.5607290  1.93331463    -1.165809   -1.048667  setosa
#range: Normalize values so it ranges between 0 and 1
#center: Subtract Mean
#scale: Divide by standard deviation
#BoxCox: Remove skewness leading to normality. Values must be > 0
#YeoJohnson: Like BoxCox, but works for negative values.
#expoTrans: Exponential transformation, works for negative values.
#pca: Replace with principal components
#ica: Replace with independent components
#spatialSign: Project the data to a unit circle

# Alternative imputation with bagged trees: estimate the imputation model,
# apply it to the same data, and show the first rows. The printed result
# keeps the original measurement scale.
dulieu4 <- preProcess(dulieu1, method = "bagImpute")
dulieu4.2 <- predict(dulieu4, newdata = dulieu1)
head(dulieu4.2)
##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1     5.026926         3.5          1.4         0.2  setosa
## 2     4.819678         3.0          1.4         0.2  setosa
## 3     4.772678         3.2          1.3         0.2  setosa
## 4     4.600000         3.1          1.5         0.2  setosa
## 5     5.000000         3.6          1.4         0.2  setosa
## 6     5.400000         3.9          1.7         0.4  setosa