The author prepares the Lab 4 report and his findings as follows. First, the author reads and cleans the collected data, then selects the features relevant to the classifier; next, the author builds and evaluates several classifiers. Finally, the author concludes that kNN (k = 7), random forest (mtry = 2), and logistic regression produce accuracies of roughly 86%, 84%, and 67% respectively on the stage III melanoma case study.
dim(pheno)
## [1] 79 49
fit <- lmFit(gset[, row.names(design)], design, method = "robust")
## Warning in rlm.default(x = X, y = y, weights = w, ...): 'rlm' failed to
## converge in 20 steps
## (warning repeated many times; duplicate copies omitted)
efit <- eBayes(fit, trend = TRUE)
dt_fdr <- decideTests(efit)
summary(dt_fdr)
## (Intercept) survClassPoor
## Down 0 264
## NotSig 0 25782
## Up 26085 39
top10 <- topTable(efit, n = 10)
## Removing intercept from test coefficients
X <- t(expVal[row.names(top10), row.names(design)])
# Outcome variable
Y <- as.factor(pheno[row.names(design), "survClass"])
# Combine the outcome and the top-10 features into a single data frame
new_Date <- data.frame(Y, X)
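Before fitting any models, a quick sanity check on the assembled data frame is worthwhile; a minimal sketch using only base R:
# Should be 49 samples by 11 columns (outcome plus 10 probes), matching the model printouts below
dim(new_Date)
# Class balance of the outcome; a strong imbalance would make raw accuracy misleading
table(new_Date$Y)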
First, we fit a random forest model.
library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:Biobase':
##
## combine
## The following object is masked from 'package:BiocGenerics':
##
## combine
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
fit <- randomForest(Y ~ ., data = new_Date)
predRF <- predict(fit, new_Date)
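As a side note (not part of the original analysis), the fitted forest can report which of the ten probes carry the most signal:
# Mean decrease in Gini impurity per probe; larger values indicate more influential features
importance(fit)
# Dot chart of the same importance scores
varImpPlot(fit)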
Second, we fit a logistic regression model, classifying with a 0.5 probability cutoff.
fit <- glm(Y ~ ., data = new_Date, family = binomial)
preLR <- predict(fit, new_Date, type = "response")
predLR <- factor(preLR > 0.5, levels = c(FALSE, TRUE), labels = c("Good", "Poor"))
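A confusion table makes the 0.5 cutoff concrete; a minimal sketch using base R:
# Rows: predicted class from the 0.5 threshold; columns: observed class
table(Predicted = predLR, Observed = new_Date$Y)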
Third, we fit a kNN model.
library(class)
# The training and "test" sets are identical here, so these are training-set predictions
train <- dplyr::select(new_Date, -Y)
test <- dplyr::select(new_Date, -Y)
c1 <- as.factor(Y)
predKNN1 <- knn(train, test, c1, k = 1)
predKNN10 <- knn(train, test, c1, k = 10)
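Because the same rows are used for training and prediction, the k = 1 error is zero by construction (each point is its own nearest neighbour); a short sketch to verify:
# Training-set error rates; the k = 1 value says nothing about generalization
mean(predKNN1 != c1)
mean(predKNN10 != c1)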
Next, we fit the logistic regression model and evaluate it using 5-fold cross-validation.
set.seed(51773)
# Load necessary library
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")
# Train the model
glm_model <- train(Y ~ ., data = new_Date, method = "glm", trControl = ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print model details
print(glm_model)
## Generalized Linear Model
##
## 49 samples
## 10 predictors
## 2 classes: 'Good', 'Poor'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39, 39, 39, 40, 39
## Resampling results:
##
## Accuracy Kappa
## 0.6733333 0.3544186
# Check cross-validated results
print(glm_model$resample)
## Accuracy Kappa Resample
## 1 0.5000000 0.000000 Fold1
## 2 0.7000000 0.400000 Fold2
## 3 0.6000000 0.200000 Fold3
## 4 0.6666667 0.372093 Fold4
## 5 0.9000000 0.800000 Fold5
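The fold-to-fold spread is wide (0.50 to 0.90), so the mean accuracy should be read together with its variability; a quick sketch:
# Mean and standard deviation of the five fold accuracies
mean(glm_model$resample$Accuracy)
sd(glm_model$resample$Accuracy)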
We then fit the kNN model and evaluate it using 5-fold cross-validation.
# Load necessary library
set.seed(51773)
library(caret)
# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")
# Train the model
knn_model <- train(Y ~ ., data = new_Date, method = "knn", trControl = ctrl)
# Print model details
print(knn_model)
## k-Nearest Neighbors
##
## 49 samples
## 10 predictors
## 2 classes: 'Good', 'Poor'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39, 39, 39, 40, 39
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.8577778 0.7160976
## 7 0.8577778 0.7160976
## 9 0.8377778 0.6760976
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# Check cross-validated results
print(knn_model$resample)
## Accuracy Kappa Resample
## 1 0.8000000 0.6000000 Fold1
## 2 0.6000000 0.2000000 Fold3
## 3 1.0000000 1.0000000 Fold2
## 4 1.0000000 1.0000000 Fold5
## 5 0.8888889 0.7804878 Fold4
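caret can also display the tuning curve over k directly; a minimal sketch:
# Cross-validated accuracy against neighbourhood size k
plot(knn_model)
# The tuning value caret selected (k = 7 here)
knn_model$bestTune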
Resubstitution (training-set) error rates for the random forest and logistic regression models:
mean(predRF != new_Date$Y)
## [1] 0
mean(predLR != new_Date$Y)
## [1] 0.244898
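The 0% random forest error above is resubstitution error: the forest essentially memorizes its training set, so this is not evidence of generalization. Because fit was reassigned to the glm above, the sketch below refits under a hypothetical name rf_fit and uses out-of-bag predictions instead:
# rf_fit is a hypothetical name, not from the original report
rf_fit <- randomForest(Y ~ ., data = new_Date)
# predict() with no newdata returns out-of-bag predictions, giving an honest
# error estimate much closer to the cross-validated results below
mean(predict(rf_fit) != new_Date$Y)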
Finally, we fit the random forest model and evaluate it using 5-fold cross-validation.
set.seed(51773)
# Load necessary libraries
library(caret)
library(randomForest)
# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")
# Train the model
rf_model <- train(Y ~ ., data = new_Date, method = "rf", trControl = ctrl)
# Print model details
print(rf_model)
## Random Forest
##
## 49 samples
## 10 predictors
## 2 classes: 'Good', 'Poor'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 39, 39, 39, 40, 39
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8355556 0.6742857
## 6 0.8177778 0.6360976
## 10 0.8155556 0.6342857
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Check cross-validated results
print(rf_model$resample)
## Accuracy Kappa Resample
## 1 0.7000000 0.4000000 Fold1
## 2 1.0000000 1.0000000 Fold2
## 3 1.0000000 1.0000000 Fold5
## 4 0.7777778 0.5714286 Fold4
## 5 0.7000000 0.4000000 Fold3
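Since all three caret models were trained with the same seed and the same 5-fold control, their resampling results can be compared side by side; a sketch using caret's resamples():
# Collect the three cross-validated models and summarise their fold accuracies
cv_results <- resamples(list(glm = glm_model, knn = knn_model, rf = rf_model))
summary(cv_results)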
I conclude that the kNN (k = 7) approach produces the best result, with a cross-validated accuracy of about 86%, when using 5-fold cross-validation on the top 10 features.