# Cách lấy mẫu (sampling method):
# Define the training control
#fitControl <- trainControl(
# method = 'cv', # k-fold cross validation
# number = 5, # number of folds
# savePredictions = 'final', # saves predictions for optimal tuning parameter
# classProbs = T, # should class probabilities be returned
# summaryFunction=twoClassSummary # results summary function
#)
# Load the built-in iris data as the working data set, and keep a second
# copy (`dulieu1`) taken before any changes are made to `dulieu`.
dulieu <- iris
dulieu1 <- dulieu
head(dulieu, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Report the number of rows and columns of the working data.
dim(dulieu)
## [1] 150 5
# Rename the five columns to bienA, bienB, bienC, bienD, bienE (in order).
names(dulieu) <- paste0("bien", LETTERS[1:5])
head(dulieu, n = 6L)
## bienA bienB bienC bienD bienE
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Fix the RNG seed so the random 80/20 split below (and any resampling that
# runs after it) can be reproduced from one run to the next.
set.seed(123)

# Draw the row indices of an 80% training partition, using bienA as the
# outcome vector that guides the split.
# NOTE(review): the classifiers fitted on this split predict bienE; consider
# passing dulieu$bienE here if balanced classes in the test set matter --
# confirm with the author, since the same split also feeds a bienA regression.
chiadl <- createDataPartition(dulieu$bienA, p = 0.8, list = FALSE)
traindl <- dulieu[chiadl, ]
testdl <- dulieu[-chiadl, ]

# Predictors (columns 1-4) and class labels (column 5) of the training rows.
traindata <- traindl[, 1:4]
classdata <- traindl[, 5]
# Resampling scheme for the kNN classifier: 10-fold cross-validation.
huanluyen <- trainControl(method = "cv", number = 10)

# k-nearest-neighbour classifier on centred and scaled predictors; train()
# evaluates 10 candidate values of k and keeps the best one.
knn1 <- train(
  x = traindata,
  y = classdata,
  method = "knn",
  preProcess = c("center", "scale"),
  trControl = huanluyen,
  tuneLength = 10
)
print(knn1)
## k-Nearest Neighbors
##
## 121 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: centered (4), scaled (4)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 110, 109, 109, 109, 108, 108, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.9596154 0.9392839
## 7 0.9596154 0.9392839
## 9 0.9512821 0.9267839
## 11 0.9679487 0.9517839
## 13 0.9596154 0.9392839
## 15 0.9505245 0.9253598
## 17 0.9421911 0.9128598
## 19 0.9185897 0.8773649
## 21 0.9102564 0.8648649
## 23 0.9025641 0.8527273
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 11.
# Confusion matrix of the kNN model computed from its resampling results.
confusionMatrix(data = knn1)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction setosa versicolor virginica
## setosa 33.1 0.0 0.0
## versicolor 0.0 33.9 1.7
## virginica 0.0 1.7 29.8
##
## Accuracy (average) : 0.9669
# Predict the held-out rows with the kNN model, then compare the predictions
# with the labels stored in column 5 of the test set.
uocluong1 <- predict(knn1, newdata = testdl)
confusionMatrix(data = uocluong1, reference = testdl[, 5])
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 7 1
## virginica 0 0 11
##
## Overall Statistics
##
## Accuracy : 0.9655
## 95% CI : (0.8224, 0.9991)
## No Information Rate : 0.4138
## P-Value [Acc > NIR] : 3.242e-10
##
## Kappa : 0.9476
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 1.0000 0.9167
## Specificity 1.0000 0.9545 1.0000
## Pos Pred Value 1.0000 0.8750 1.0000
## Neg Pred Value 1.0000 1.0000 0.9444
## Prevalence 0.3448 0.2414 0.4138
## Detection Rate 0.3448 0.2414 0.3793
## Detection Prevalence 0.3448 0.2759 0.3793
## Balanced Accuracy 1.0000 0.9773 0.9583
library(MASS)
# Neural-network classifier on predictors rescaled with "range".
# No trControl is passed here, so train() falls back to its default
# resampling scheme; trace and maxit are forwarded to the underlying fit.
nnet1 <- train(
  x = traindata,
  y = classdata,
  method = "nnet",
  preProcess = "range",
  tuneLength = 2,
  maxit = 100,
  trace = FALSE
)
print(nnet1)
## Neural Network
##
## 121 samples
## 4 predictor
## 3 classes: 'setosa', 'versicolor', 'virginica'
##
## Pre-processing: re-scaling to [0, 1] (4)
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 121, 121, 121, 121, 121, 121, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0.0 0.8334068 0.7508701
## 1 0.1 0.9606782 0.9403029
## 3 0.0 0.9380184 0.9057234
## 3 0.1 0.9605004 0.9397472
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 1 and decay = 0.1.
# Confusion matrix of the neural network computed from its resampling results.
confusionMatrix(data = nnet1)
## Bootstrapped (25 reps) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction setosa versicolor virginica
## setosa 35.2 0.0 0.0
## versicolor 0.0 32.2 0.6
## virginica 0.0 3.3 28.6
##
## Accuracy (average) : 0.9608
# Predict the held-out rows with the neural network, then compare the
# predictions with the labels stored in column 5 of the test set.
uocluong2 <- predict(nnet1, newdata = testdl)
confusionMatrix(data = uocluong2, reference = testdl[, 5])
## Confusion Matrix and Statistics
##
## Reference
## Prediction setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 6 0
## virginica 0 1 12
##
## Overall Statistics
##
## Accuracy : 0.9655
## 95% CI : (0.8224, 0.9991)
## No Information Rate : 0.4138
## P-Value [Acc > NIR] : 3.242e-10
##
## Kappa : 0.9466
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: setosa Class: versicolor Class: virginica
## Sensitivity 1.0000 0.8571 1.0000
## Specificity 1.0000 1.0000 0.9412
## Pos Pred Value 1.0000 1.0000 0.9231
## Neg Pred Value 1.0000 0.9565 1.0000
## Prevalence 0.3448 0.2414 0.4138
## Detection Rate 0.3448 0.2069 0.4138
## Detection Prevalence 0.3448 0.2069 0.4483
## Balanced Accuracy 1.0000 0.9286 0.9706
# Model formula: bienA explained by the three other numeric columns.
congthuc <- bienA ~ bienB + bienC + bienD
# Ordinary least-squares fit on the full data set.
# NOTE(review): the fitted model is stored under the name `lm`, the same name
# as the fitting function; calls of the form lm(...) still resolve to the
# function, but a distinct name would be clearer.
lm <- lm(formula = congthuc, data = dulieu)
summary(lm)
##
## Call:
## lm(formula = congthuc, data = dulieu)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82816 -0.21989 0.01875 0.19709 0.84570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.85600 0.25078 7.401 9.85e-12 ***
## bienB 0.65084 0.06665 9.765 < 2e-16 ***
## bienC 0.70913 0.05672 12.502 < 2e-16 ***
## bienD -0.55648 0.12755 -4.363 2.41e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3145 on 146 degrees of freedom
## Multiple R-squared: 0.8586, Adjusted R-squared: 0.8557
## F-statistic: 295.5 on 3 and 146 DF, p-value: < 2.2e-16
# The same linear model fitted through caret; with no trControl given,
# train() applies its default resampling to estimate RMSE, R-squared and MAE.
lm1 <- train(congthuc, method = "lm", data = dulieu)
print(lm1)
## Linear Regression
##
## 150 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 150, 150, 150, 150, 150, 150, ...
## Resampling results:
##
## RMSE Rsquared MAE
## 0.3175866 0.8588323 0.2585811
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Tuning grid: candidate numbers of neighbours 1, 2, ..., 10.
k <- data.frame(.k = seq(from = 1, to = 10, by = 1))
# kNN regression of bienA on the training rows, tuned over the grid with
# 10-fold cross-validation (despite its name, `lm2` is not a linear model).
lm2 <- train(
  congthuc,
  data = traindl,
  method = "knn",
  tuneGrid = k,
  trControl = trainControl(method = "cv", number = 10)
)
print(lm2)
## k-Nearest Neighbors
##
## 121 samples
## 3 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 110, 110, 109, 109, 109, 108, ...
## Resampling results across tuning parameters:
##
## k RMSE Rsquared MAE
## 1 0.4008668 0.7768369 0.3188589
## 2 0.3667971 0.8024508 0.2901566
## 3 0.3650434 0.8138022 0.2948610
## 4 0.3537024 0.8347009 0.2869587
## 5 0.3566483 0.8245378 0.2885367
## 6 0.3663488 0.8156912 0.2949729
## 7 0.3654594 0.8156979 0.2947473
## 8 0.3656054 0.8154683 0.2941686
## 9 0.3635110 0.8154435 0.2941193
## 10 0.3563023 0.8252836 0.2915722
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 4.
# Predictions of the kNN regression for the test rows; the first column
# (the response bienA) is dropped from the data passed in.
predict(lm2, newdata = testdl[-1])
## [1] 4.600000 4.816667 5.071429 5.300000 5.225000 4.966667 4.900000 4.600000
## [9] 5.325000 5.375000 5.350000 6.540000 5.742857 5.980000 5.250000 5.650000
## [17] 6.620000 6.200000 7.325000 6.575000 6.400000 6.600000 6.200000 7.600000
## [25] 6.500000 6.100000 6.600000 6.500000 6.625000
library(rpart)
# Regression tree for bienA on the full data set; train() tries 10 values
# of the complexity parameter cp using its default resampling.
rpar1 <- train(
  congthuc,
  data = dulieu,
  tuneLength = 10,
  method = "rpart"
)
## Warning in nominalTrainWorkflow(x = x, y = y, wts = weights, info = trainInfo,
## : There were missing values in resampled performance measures.
print(rpar1)
## CART
##
## 150 samples
## 3 predictor
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 150, 150, 150, 150, 150, 150, ...
## Resampling results across tuning parameters:
##
## cp RMSE Rsquared MAE
## 0.003265012 0.3922836 0.7737134 0.3133693
## 0.005721396 0.3941313 0.7716523 0.3133411
## 0.006922562 0.3930462 0.7721918 0.3121687
## 0.008358800 0.3962493 0.7688987 0.3145670
## 0.016980371 0.4161701 0.7471649 0.3300327
## 0.023031646 0.4265943 0.7338607 0.3378040
## 0.029804524 0.4282205 0.7308005 0.3374651
## 0.057188720 0.4533376 0.6952094 0.3619504
## 0.121807006 0.5126831 0.6169138 0.4126823
## 0.613462371 0.6254975 0.5601944 0.5104314
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was cp = 0.003265012.
# Introduce missing values: blank out the first column in rows 1 to 3.
dulieu1[seq_len(3), 1] <- NA
library(RANN) # required for knnImpute
# Fit a k-nearest-neighbour imputation model on dulieu1 and apply it to the
# same data.
# NOTE(review): the printed result shows the numeric columns on a
# standardized scale rather than the original units -- knnImpute appears to
# centre and scale as part of the imputation; confirm against the caret docs.
dulieu2 <- preProcess(dulieu1, method = "knnImpute")
dulieu3 <- predict(dulieu2, newdata = dulieu1)
head(dulieu3, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 -0.8031618 1.01560199 -1.335752 -1.311052 setosa
## 2 -1.4577304 -0.13153881 -1.335752 -1.311052 setosa
## 3 -1.3607573 0.32731751 -1.392399 -1.311052 setosa
## 4 -1.5304603 0.09788935 -1.279104 -1.311052 setosa
## 5 -1.0455946 1.24503015 -1.335752 -1.311052 setosa
## 6 -0.5607290 1.93331463 -1.165809 -1.048667 setosa
#range: Normalize values so it ranges between 0 and 1
#center: Subtract Mean
#scale: Divide by standard deviation
#BoxCox: Remove skewness leading to normality. Values must be > 0
#YeoJohnson: Like BoxCox, but works for negative values.
#expoTrans: Exponential transformation, works for negative values.
#pca: Replace with principal components
#ica: Replace with independent components
#spatialSign: Project the data to a unit circle
# Alternative imputation with method "bagImpute": fit the imputation model
# on dulieu1, then fill in the missing entries of that same data set.
dulieu4 <- preProcess(dulieu1, method = "bagImpute")
dulieu4.2 <- predict(dulieu4, newdata = dulieu1)
head(dulieu4.2, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.026926 3.5 1.4 0.2 setosa
## 2 4.819678 3.0 1.4 0.2 setosa
## 3 4.772678 3.2 1.3 0.2 setosa
## 4 4.600000 3.1 1.5 0.2 setosa
## 5 5.000000 3.6 1.4 0.2 setosa
## 6 5.400000 3.9 1.7 0.4 setosa