SampleCode

#         #AP series
#         #**Final dataset ap1***
#         
#         library(psych);     library(ggplot2)
#         library(caret)
#         library(BBmisc)
#         install.packages("mice")
#         library(VIM)
#         library(mice)
#         aa4 <- read.csv("~/Documents/X/X/aa4.csv", header=T)
#         #1 : AP000 series
#         x <- which(grepl("AP00", colnames(aa4))==T)
#         ap <- aa4[, x]
#         ap <- cbind(ap, MajDo=aa4$MajDo) # only AP
#         dim(ap)
#         
#         ###############################################################################
#         #Corr
#         tail(colnames(ap))
#         CorrVal <- sapply(1:ncol(ap),
#                           FUN=function(x,y) cor(as.numeric(ap[,x]),y, use="pairwise.complete.obs"), 
#                           y=as.numeric(ap$MajDo))
#         cor_ap <- round(CorrVal,2)
#         
#         
#         ###############################################################################
#         #NZV
#         B <- sapply(1:ncol(ap), function(x) length(which(!is.na(ap[,x]))))
#         #           nzv_ap <- nearZeroVar(ap, freqCut = 999, uniqueCut = 100*(1/nrow(ap)), saveMetrics = TRUE, allowParallel = TRUE)
#         #            nzv_ap1 <- nearZeroVar(ap, freqCut = 15, uniqueCut = 100*(1/nrow(ap)), saveMetrics = TRUE, allowParallel = TRUE)
#         #            nzv_ap2 <- nearZeroVar(ap, freqCut = 15, , uniqueCut = 0.044, saveMetrics = TRUE, allowParallel = TRUE)
#         nzv_ap <- nearZeroVar(ap,saveMetrics = TRUE, allowParallel = TRUE)
#         head(nzv_ap) 
#         nzv_ap <- cbind(nzv_ap, nonNA=B, Corr=cor_ap)            
#         head(nzv_ap)
#         nzv_ap$UniqueValue <- nzv_ap$percentUnique * nrow(ap) /100
#         head(nzv_ap)    
#         nzv_ap$FirstValue <- ifelse(nzv_ap$UniqueValue ==2,
#                                     (nzv_ap$freqRatio*nrow(ap) - nzv_ap$freqRatio*(nrow(ap) - nzv_ap$nonNA)) / (1+ nzv_ap$freqRatio),NA)
#         nzv_ap$SecondValue <- ifelse(nzv_ap$UniqueValue ==2, (nzv_ap$nonNA-nzv_ap$FirstValue),NA)
#         head(nzv_ap)
#         dim(nzv_ap)
#         #**Final dataset ap***
#         dim(ap)
#             #   ap1 <- ap %>% BBmisc::dropNamed("MajDo")
#             #   dim(ap1)
#             #   non AP variable to delete
#         #       write.csv(ap1, file="ap1.csv", row.names=F)
#         
#         dim(ap)
#         
#         aggr_plot <- aggr(ap, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE, 
#                           labels=names(ap), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
#         
#         ###############################################################################
#         #Combining Siebel
#         sbl <- read.csv("~/Documents/ACS/Acxion2/sbl2.csv", header=T)
#         # 3 AP + Siebel
#         ap2 <- cbind(ap, sbl) # AP + Siebel
#         head(sapply(ap2, class)); tail(sapply(ap2, class))
#         ###############################################################################
#         #dimension Reduction
#         which(sapply(ap2, class)=="factor")
#         ap2$Const_Status <- as.numeric(ap2$Const_Status) #...1
#         ap2$Const_Recog_Surv_Flag <- as.numeric(ap2$Const_Recog_Surv_Flag) #...2
#         ap2$RelationshipToCancer_One_Value <- as.numeric(ap2$RelationshipToCancer_One_Value) #...3
#         names(ap2)
#         ap2 <- ap2[, c(1:51, 53:60,52)]
#         table(ap2$MajDo)
#         names(ap2)
#         dim(ap2)
#         #ap2X <- na.omit(ap2)    
#         #dim(ap2); dim(ap2X)
#                         #can we impute
#                         ############################ Imputation for #2 list #####################
#         sapply(ap2, class)
#         #write.csv(ap2, file="ap2X.csv")
#         #IMPUTATION WITH MICE
#                         ap2mice <- mice(ap2,m=1,maxit=1,meth='pmm',seed=500)
#                         summary(ap2mice)
#                         ap2mice1 <- complete(ap2mice) #...part 1
#         dim(ap2mice1)
#         
#         # write.csv(ap2mice1, file="ap2mice1.csv", row.names=F)
#         
#                         
#         
#                         #Counting number of NA by column
#                         NuOfNA <- function(file,x) {
#                             length(which(is.na(file[ ,x]==TRUE)))
#                         }
#                         #   save(NuOfNA, file="NuOfNA.Rdata")
#                         
#                         Pct_NA <- sapply(1:ncol(ap2mice1), function(Y) NuOfNA(ap2mice1,Y)/nrow(ap2mice1))
#         
#         
#         
#                         #replace columns with imputed data
#                         impute_colna <- colnames(ap2mice1)
#                         impute_fn <- function(d) modifyList(d, sapply(1:length(impute_colna), 
#                                                   function(i) d[,impute_colna[i]] <- ap2mice1[i]))
#                         ap3 <- impute_fn(ap2)
#                     sapply(ap3, class)
#         
#         names(ap2mice1)
#         names(ap2)
#         
#         ap3 <- ap2mice1
#         
#         #Change back to Integers
#         which(sapply(ap3, class)=="factor")
#         
#         #             asNumeric <- function(x) as.numeric(as.character(x))
#         #             asNumeric <- function(x) as.numeric(x)
#         #             factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)], asNumeric))
#         #             ap4 <- factorsNumeric(ap3)
#         #             sapply(ap4, class)
#         
#         #balance MajDo and Non-MajDo
#         set.seed(111); Samp <- sample(1:3000, 1630)
#         ap3x <- subset(ap3, MajDo ==0, )
#         dim(ap3x)
#         ap3x <- ap3x[Samp,]
#         dim(ap3x)
#         ap3y <- rbind(ap3x, subset(ap3, MajDo==1,))    
#         dim(ap3y)
#         
#         #try PLS or PCA
#             KMO(ap3y[, 1:59])
#             rangeStd <- function(x) {(x-min(x))/(max(x)-min(x))}
#             stdData <- apply(subset(ap3y, , 1:59),2, rangeStd)
#             pc <- principal(stdData, nfactors=6, rotate='varimax')
#             par(mfrow=c(1,1))
#             plot(pc$values[1:6], type="b", main="Scree Plot")
#             #each of these numbers amount of vari encapsulated in each PC of this dataset
#             pc$weights
#             head(pc$scores) # this is a new dataset with 4 PC as x1,x2,x3,x4
#             pc$loadings #Important: this is a correlation of original variable with PC
#         
#         ###############################################################################
#         # Clustering
#         cl <- kmeans(stdData, 2)
#         stdData1 <- cbind(stdData, CLUSTER=as.factor(cl$cluster), MJ = ap3y$MajDo)
#         colnames(stdData1)
#         #plot
#         ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2)) +
#             #geom_point(aes(color=as.factor(stdData1[,61]),  size=as.factor(stdData1[,60]))) + 
#         
#             geom_point(aes(color=as.factor(stdData1[,61]), shape=as.factor(stdData1[,60]), size=5), 
#                        alpha=0.9, position= "jitter") +
#             #scale_size_continuous(range=c(1,2)) + 
#             theme_bw() + 
#             ggtitle('First 2 prinipal components clustering w/ kmeans') + 
#             xlab("Principal Component 1") + ylab("Principal Component 2")
#         
#         ap4 <- cbind(ap3y, CLUSTER=as.factor(cl$cluster))
#         table(ap4$MajDo, ap4$CLUSTER)
#         # ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,59]))) + 
#         #     geom_point(size=5) + theme_bw() + 
#         #     ggtitle('First 2 prinipal components clustering w/ kmeans') + 
#         #     xlab("Principal Component 1") + ylab("Principal Component 2")
#         
#         #ap2X <- ap2X %>% BBmisc::dropNamed("CLUSTER")
#         
#         cl <- kmeans(stdData, 2)
#         stdData1 <- cbind(stdData, CLUSTER=as.factor(cl$cluster))    
#         ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,59]))) + 
#             geom_point(size=5) + theme_bw() + 
#             ggtitle('First 2 prinipal components clustering w/ kmeans') + 
#             xlab("Principal Component 1") + ylab("Principal Component 2")
#         
#         ####################################################################################
#         # MODELING 
#         #ap2X dataset
#         #GBM, RF, PLS, Logistic Modeling
#         setwd("~/Documents/ACS/Acxion2")
#         ap2X <- read.csv("~/Documents/ACS/Acxion2/ap2X.csv", header=T)
#                         # library(gbm)
#                         # gbmModel <- gbm(MajDo ~ ., data=tr, #aa4
#                         #                 distribution="bernoulli",
#                         #                 interaction.depth=5, #10
#                         #                 n.trees=30, # 200
#                         #                 shrinkage=0.01, #0.01
#                         #                 verbose=T) #F
#                         
#                         names(ap2X)
#                         ap2X$MajDo <- as.factor(ap2X$MajDo)
#                         set.seed(10001)
#                         inTrain <- createDataPartition(ap2X$MajDo, p=.8, list=F)
#                         tr <- ap2X[inTrain,]
#                         ts <- ap2X[-inTrain, ]
#                         table(tr$MajDo); table(ts$MajDo)
#                         class(tr$MajDo); class(ts$MajDo)
#         tr$MajDo <- as.factor(tr$MajDo)
#         ts$MajDo <- as.factor(ts$MajDo)
#         
#         ############ GBM ##############################################
#         
#                         controlObject <- trainControl(method="cv", number=10) #...1
#                         gbmGrid <- expand.grid(n.trees=seq(1,300, by =50),
#                                                interaction.depth= seq(1,6, by=2),
#                                                shrinkage=c(0.1,0.01), n.minobsinnode=20) #...2
#                         
#                         gbmModel1 <- train(MajDo ~ ., data=tr,
#                                            method="gbm",
#                                            tuneGrid=gbmGrid,
#                                            verbose=F,
#                                            trControl=controlObject)
#                         #results
#                         confusionMatrix(gbmModel1) #...results
#                         gbmModel1 #...results
#                         plot(gbmModel1) #...results
#                         summary(gbmModel1, 6) # #...results #Top 6 Variables 
#                         
#                         gbmModel1$results[which.max(gbmModel1$result[,5]),] # best result
#                         
#                         #Prediction
#                         pred2 <- predict(gbmModel1, ts[,-59], type="raw")
#                         m <- table(pred2,ts$MajDo)
#                         m
#                         (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#         
#         ############## GBM ########################################
#                         gbmGrid <- expand.grid(n.trees=(1:30)*20,
#                                                interaction.depth= seq(1,12, by=3),
#                                                shrinkage=c(0.1,0.1), n.minobsinnode=20)
#                         fitControl <- trainControl(method="repeatedcv", number=10, repeats=10) # 1
#                         gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1)
#                         gbmFit2 <- train(MajDo ~., data=tr1, 
#                                          method="gbm", 
#                                          trControl = fitControl, 
#                                          verbose=F, tuneGrid = gbmGrid)
#                         summary(gbmFit2, 6) # Top 6 Variables 
#                         confusionMatrix(gbmFit2) # *** shows results in percentage
#         
#         ################# GBM ##################################################
#         fitControl <- trainControl(method="repeatedcv", number=10, repeats=10) # 1
#         set.seed(888)
#         gbmFit1 <- train(Class~., data=training, method="gbm", trControl = fitControl, verbose=F
#                          gbmFit1$results[which.max(gbmFit1$result[,4]),] # best result
#                          summary(gbmFit1, 6) # Top 6 Variables
#                          #tripling the n.tree from 150 to 450 for demo purposes
#                          gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1)
#                          # model1 & 2 side by side
#                          trellis.par.set(caretTheme())
#                          par(mfrow=c(1,2))
#                          plot(gbmModel1)
#                          plot(gbmFit2) 
#                          plot(gbmFit3)
#                          plot(gbmFit2, metric = "Kappa", plotType = "level",scales = list(x = list(rot = 90))) #KAPPA
#                          
#                          ################# GBM ##################################################
#                          set.seed=777
#                          gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1, n.minobsinnode=20)
#                          fitControl=trainControl(method="repeatedcv", number=10,repeats=10, classProbs = T, summaryFunction = twoClassSummary) #2
#                          gbmFit3 <- train(MajDo~., data=tr, method="gbm", trControl = fitControl, verbose=F, tuneGrid = gbmGrid, metric="ROC")
#                          gbmFit3$results[which.max(gbmFit3$result[,5]),] # best result
#                          
#                          summary(gbmFit3, 6) # Top 6 Variables
#                          which2Pct <- tolerance(gbmFit3$results, metric="ROC", tol=2, maximize=TRUE)
#                          gbmFit3$results[which2Pct, 1:7]
#                         
#                          pred3 <- predict(gbmFit3, newdata=ts[,-59], type="raw")
#                          m <- table(pred3,ts$MajDo)
#                          m
#                          (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#                          confusionMatrix(gbmFit3) # *** shows results in percentage
#                          
#                          ################## RANDOM FOREST ######################################################
#                          ctrl <- trainControl(method = "CV",
#                                               number=10,
#                                               classProbs = TRUE,
#                                               allowParallel = TRUE,
#                                               summaryFunction = twoClassSummary)
#                          
#                          set.seed(476)
#                          rfFit <- train(MajDo ~. ,
#                                         data=tr,
#                                         method = "rf",
#                                         tuneGrid = expand.grid(.mtry = seq(1,50,by=5)),
#                                         ntrees=300,
#                                         importance = TRUE,
#                                         metric = "ROC",
#                                         trControl = ctrl)
#                          plot(rfFit)
#                          ts1 <- na.omit(ts)
#                          pred <- predict(rfFit, newdata = ts1[,-60], type = "raw") 
#                          m <- table(pred,ts1$MajDo)
#                          m
#                          (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#                          varImp(rfFit)
#                          x <- (varImp(rfFit))
#                          xx <- do.call("cbind", x)
#                          
#                          row.names(xx)
#                          ########### LOGISTIC#######################################################
#                          
#                          mod1 <- glm(MajDo ~ ., data=tr, family=binomial)
#                          pred <- predict(mod1, newdata=ts[,-59], type="response")
#                          head(round(pred,2),20)
#                          pred2 <- ifelse(predict(mod1, newdata= ts[,-59], type="response")>.5, 1, 0)
#                          m <- table(pred2,ts$MajDo)
#                          m
#                          (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#                          
#                          ############ PLS ######################################################
#                          
#                          plsFit <- train(MajDo ~., data=tr, method="pls", preProc=c("center", "scale"))
#                          
#                          plsFit1 <- train(MajDo~., data=tr, method = "pls", 
#                                           tuneLength=15, metric="ROC", 
#                                           trControl=(trainControl(method="repeatedcv", repeats=3, 
#                                                                   classProbs=TRUE, summaryFunction=twoClassSummary)),  
#                                           preProc=c("center","scale"))
#                          plsFit1
#                          plot(plsFit1)
#                          plsClasses <- predict(plsFit1, newdata=ts[,-59])
#                          str(plsClasses)
#                          confusionMatrix(data=plsClasses, ts$MajDo)
#                          m <- table(plsClasses,ts$MajDo)
#                          m
#                          (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#                          
#                          
#                          ########### SVM #################################################
#