Introduction

The author prepares this lab 4 report and presents the findings as follows. First, the author reads and cleans the collected data, then selects the features relevant to the classifier; afterwards, the author builds and evaluates three classifiers. Finally, for the stage III melanoma case study, the kNN (k = 7), random forest (mtry = 2), and logistic regression models produce 5-fold cross-validated accuracies of 85.8%, 83.6%, and 67.3%, respectively.

Reading and Cleaning the Data

dim(pheno)
## [1] 79 49
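
The objects gset, pheno, expVal, and design are used below, but their construction is not shown in the report. A minimal sketch of how they might be assembled, assuming the data come from a GEO ExpressionSet and that survClass is a column of the sample annotation (the accession is a placeholder, not from the report):

library(GEOquery)
library(Biobase)

gset <- getGEO("GSE00000")[[1]]   # hypothetical accession
pheno <- pData(gset)              # sample annotation (79 samples x 49 variables here)
expVal <- exprs(gset)             # probe-by-sample expression matrix

# Keep samples with a known survival class and build the limma design matrix
keep <- pheno[!is.na(pheno$survClass), ]
design <- model.matrix(~ survClass, data = keep)
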
# Fit a robust linear model to each probe with limma
fit <- lmFit(gset[, row.names(design)], design, method = "robust")
## Warning in rlm.default(x = X, y = y, weights = w, ...): 'rlm' failed to
## converge in 20 steps
## (this warning is emitted once per non-converging probe; duplicates omitted)
# Empirical Bayes moderation with an intensity trend, then FDR-adjusted calls
efit <- eBayes(fit, trend = TRUE)
dt_fdr <- decideTests(efit)
summary(dt_fdr)
##        (Intercept) survClassPoor
## Down             0           264
## NotSig           0         25782
## Up           26085            39
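
The decision matrix itself can be used to pull out the flagged probes; a small check using only objects already defined above:

# Probes called Down (-1) or Up (+1) for the survClassPoor coefficient
de_probes <- row.names(dt_fdr)[dt_fdr[, "survClassPoor"] != 0]
length(de_probes)  # 264 down + 39 up = 303, matching the summary above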

Selecting the Top 10 Features

# Top 10 probes ranked by evidence of differential expression
top10 <- topTable(efit, n = 10)
## Removing intercept from test coefficients
# Predictor matrix: expression of the top 10 probes, samples in rows
X <- t(expVal[row.names(top10), row.names(design)])
# Outcome variable: survival class (Good / Poor)
Y <- as.factor(pheno[row.names(design), "survClass"])
# Combine outcome and predictors into one modelling data frame
new_Date <- data.frame(Y, X)
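
Before modelling, a quick sanity check on the assembled data frame confirms its dimensions and class balance (the 49 samples and 10 predictors also appear in the cross-validation summaries below):

dim(new_Date)      # 49 samples x 11 columns (outcome plus 10 probe features)
table(new_Date$Y)  # counts of Good vs Poor samples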

Building the classifiers for the stage III melanoma case study using 5-fold cross-validation, including feature selection

First, the random forest model

library(randomForest)
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:Biobase':
## 
##     combine
## The following object is masked from 'package:BiocGenerics':
## 
##     combine
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Fit a random forest and predict on the same data (resubstitution)
fit <- randomForest(Y ~ ., new_Date)
predRF <- predict(fit, new_Date)

Second, the logistic regression model

# Fit a logistic regression; predicted probabilities are for the "Poor" class
fit <- glm(Y ~ ., new_Date, family = binomial)
preLR <- predict(fit, new_Date, type = "response")

# Threshold at 0.5: probabilities above 0.5 are classified as "Poor"
predLR <- factor(preLR > 0.5, levels = c(FALSE, TRUE), labels = c("Good", "Poor"))

Third, the kNN model

library(class)

# For resubstitution, the same data serve as both the training and test sets
train <- dplyr::select(new_Date, -Y)
test <- dplyr::select(new_Date, -Y)
c1 <- as.factor(Y)

predKNN1 <- knn(train, test, c1, k = 1)
predKNN10 <- knn(train, test, c1, k = 10)
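
Because train and test are identical, k = 1 gives a trivially perfect fit: each sample is its own nearest neighbour. The resubstitution errors make this visible; a quick check using the objects above:

# Resubstitution error for each k; k = 1 is 0 by construction
mean(predKNN1 != c1)
mean(predKNN10 != c1)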

Training and evaluating the logistic regression model with 5-fold cross-validation

set.seed(51773)

# Load necessary library
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")

# Train the model
glm_model <- train(Y ~ ., data = new_Date, method = "glm", trControl = ctrl)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# Print model details
print(glm_model)
## Generalized Linear Model 
## 
## 49 samples
## 10 predictors
##  2 classes: 'Good', 'Poor' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 39, 39, 39, 40, 39 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.6733333  0.3544186
# Check cross-validated results
print(glm_model$resample)
##    Accuracy    Kappa Resample
## 1 0.5000000 0.000000    Fold1
## 2 0.7000000 0.400000    Fold2
## 3 0.6000000 0.200000    Fold3
## 4 0.6666667 0.372093    Fold4
## 5 0.9000000 0.800000    Fold5
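
The glm.fit warnings above indicate (quasi-)complete separation: with 10 predictors and only 49 samples, the maximum-likelihood estimates diverge. One common workaround is penalized logistic regression; a minimal sketch, assuming the glmnet package is available (this is an alternative, not part of the original analysis):

library(glmnet)
set.seed(51773)
# Lasso-penalized logistic regression (glmnet default alpha = 1) with 5-fold CV
cvfit <- cv.glmnet(as.matrix(new_Date[, -1]), new_Date$Y,
                   family = "binomial", nfolds = 5)
plot(cvfit)  # cross-validated deviance across the penalty path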

Training and evaluating the kNN model with 5-fold cross-validation

# Load necessary library
set.seed(51773)
library(caret)



# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")

# Train the model
knn_model <- train(Y ~ ., data = new_Date, method = "knn", trControl = ctrl)

# Print model details
print(knn_model)
## k-Nearest Neighbors 
## 
## 49 samples
## 10 predictors
##  2 classes: 'Good', 'Poor' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 39, 39, 39, 40, 39 
## Resampling results across tuning parameters:
## 
##   k  Accuracy   Kappa    
##   5  0.8577778  0.7160976
##   7  0.8577778  0.7160976
##   9  0.8377778  0.6760976
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 7.
# Check cross-validated results
print(knn_model$resample)
##    Accuracy     Kappa Resample
## 1 0.8000000 0.6000000    Fold1
## 2 0.6000000 0.2000000    Fold3
## 3 1.0000000 1.0000000    Fold2
## 4 1.0000000 1.0000000    Fold5
## 5 0.8888889 0.7804878    Fold4
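
caret's default grid only tried k = 5, 7, and 9; a wider search can be requested through tuneGrid. A sketch of this optional extension, reusing the ctrl object above:

# Search odd values of k from 1 to 15 instead of the default grid
knn_wide <- train(Y ~ ., data = new_Date, method = "knn",
                  trControl = ctrl, tuneGrid = data.frame(k = seq(1, 15, by = 2)))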

Resubstitution error rates for the random forest and logistic regression models

# Resubstitution (training-set) error rates
mean(predRF != new_Date$Y)
## [1] 0
mean(predLR != new_Date$Y)
## [1] 0.244898
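
These are resubstitution error rates, i.e. errors on the very data the models were trained on, which is why the random forest appears perfect. A confusion matrix makes the optimism explicit; a quick look using the objects above:

# Resubstitution confusion matrices; the random forest memorises the
# training data, so its table is perfectly diagonal
table(Predicted = predRF, Actual = new_Date$Y)
table(Predicted = predLR, Actual = new_Date$Y)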

Training and evaluating the random forest model with 5-fold cross-validation

set.seed(51773)
# Load necessary libraries
library(caret)
library(randomForest)



# Define the control function for train()
ctrl <- trainControl(method = "cv", number = 5, savePredictions = "final")

# Train the model
rf_model <- train(Y ~ ., data = new_Date, method = "rf", trControl = ctrl)

# Print model details
print(rf_model)
## Random Forest 
## 
## 49 samples
## 10 predictors
##  2 classes: 'Good', 'Poor' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 39, 39, 39, 40, 39 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##    2    0.8355556  0.6742857
##    6    0.8177778  0.6360976
##   10    0.8155556  0.6342857
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Check cross-validated results
print(rf_model$resample)
##    Accuracy     Kappa Resample
## 1 0.7000000 0.4000000    Fold1
## 2 1.0000000 1.0000000    Fold2
## 3 1.0000000 1.0000000    Fold5
## 4 0.7777778 0.5714286    Fold4
## 5 0.7000000 0.4000000    Fold3
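
To see which of the ten probes the forest relies on most, caret's varImp() can be applied to the fitted model (not shown in the original report):

# Scaled variable importance for the cross-validated random forest
varImp(rf_model)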

Conclusion

I conclude that the kNN (k = 7) approach produces the best result: 85.8% accuracy under 5-fold cross-validation with the top 10 features, compared with 83.6% for the random forest (mtry = 2) and 67.3% for the logistic regression model.