# #AP series
# #**Final dataset ap1***
#
# library(psych); library(ggplot2)
# library(caret)
# library(BBmisc)
# install.packages("mice")
# library(VIM)
# library(mice)
# aa4 <- read.csv("~/Documents/X/X/aa4.csv", header=T)
# #1 : AP000 series
# x <- which(grepl("AP00", colnames(aa4))==T)
# ap <- aa4[, x]
# ap <- cbind(ap, MajDo=aa4$MajDo) # only AP
# dim(ap)
#
# ###############################################################################
# #Corr
# tail(colnames(ap))
# CorrVal <- sapply(1:ncol(ap),
# FUN=function(x,y) cor(as.numeric(ap[,x]),y, use="pairwise.complete.obs"),
# y=as.numeric(ap$MajDo))
# cor_ap <- round(CorrVal,2)
#
#
# ###############################################################################
# #NZV
# B <- sapply(1:ncol(ap), function(x) length(which(!is.na(ap[,x]))))
# # nzv_ap <- nearZeroVar(ap, freqCut = 999, uniqueCut = 100*(1/nrow(ap)), saveMetrics = TRUE, allowParallel = TRUE)
# # nzv_ap1 <- nearZeroVar(ap, freqCut = 15, uniqueCut = 100*(1/nrow(ap)), saveMetrics = TRUE, allowParallel = TRUE)
# # nzv_ap2 <- nearZeroVar(ap, freqCut = 15, , uniqueCut = 0.044, saveMetrics = TRUE, allowParallel = TRUE)
# nzv_ap <- nearZeroVar(ap,saveMetrics = TRUE, allowParallel = TRUE)
# head(nzv_ap)
# nzv_ap <- cbind(nzv_ap, nonNA=B, Corr=cor_ap)
# head(nzv_ap)
# nzv_ap$UniqueValue <- nzv_ap$percentUnique * nrow(ap) /100
# head(nzv_ap)
# nzv_ap$FirstValue <- ifelse(nzv_ap$UniqueValue ==2,
# (nzv_ap$freqRatio*nrow(ap) - nzv_ap$freqRatio*(nrow(ap) - nzv_ap$nonNA)) / (1+ nzv_ap$freqRatio),NA)
# nzv_ap$SecondValue <- ifelse(nzv_ap$UniqueValue ==2, (nzv_ap$nonNA-nzv_ap$FirstValue),NA)
# head(nzv_ap)
# dim(nzv_ap)
# #**Final dataset ap***
# dim(ap)
# # ap1 <- ap %>% BBmisc::dropNamed("MajDo")
# # dim(ap1)
# # non AP variable to delete
# # write.csv(ap1, file="ap1.csv", row.names=F)
#
# dim(ap)
#
# aggr_plot <- aggr(ap, col=c('navyblue','red'), numbers=TRUE, sortVars=TRUE,
# labels=names(ap), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
#
# ###############################################################################
# #Combining Siebel
# sbl <- read.csv("~/Documents/ACS/Acxion2/sbl2.csv", header=T)
# # 3 AP + Siebel
# ap2 <- cbind(ap, sbl) # AP + Siebel
# head(sapply(ap2, class)); tail(sapply(ap2, class))
# ###############################################################################
# #dimension Reduction
# which(sapply(ap2, class)=="factor")
# ap2$Const_Status <- as.numeric(ap2$Const_Status) #...1
# ap2$Const_Recog_Surv_Flag <- as.numeric(ap2$Const_Recog_Surv_Flag) #...2
# ap2$RelationshipToCancer_One_Value <- as.numeric(ap2$RelationshipToCancer_One_Value) #...3
# names(ap2)
# ap2 <- ap2[, c(1:51, 53:60,52)]
# table(ap2$MajDo)
# names(ap2)
# dim(ap2)
# #ap2X <- na.omit(ap2)
# #dim(ap2); dim(ap2X)
# #can we impute
# ############################ Imputation for #2 list #####################
# sapply(ap2, class)
# #write.csv(ap2, file="ap2X.csv")
# #IMPUTATION WITH MICE
# ap2mice <- mice(ap2,m=1,maxit=1,meth='pmm',seed=500)
# summary(ap2mice)
# ap2mice1 <- complete(ap2mice) #...part 1
# dim(ap2mice1)
#
# # write.csv(ap2mice1, file="ap2mice1.csv", row.names=F)
#
#
#
# #Counting number of NA by column
# NuOfNA <- function(file,x) {
# length(which(is.na(file[ ,x]==TRUE)))
# }
# # save(NuOfNA, file="NuOfNA.Rdata")
#
# Pct_NA <- sapply(1:ncol(ap2mice1), function(Y) NuOfNA(ap2mice1,Y)/nrow(ap2mice1))
#
#
#
# #replace columns with imputed data
# impute_colna <- colnames(ap2mice1)
# impute_fn <- function(d) modifyList(d, sapply(1:length(impute_colna),
# function(i) d[,impute_colna[i]] <- ap2mice1[i]))
# ap3 <- impute_fn(ap2)
# sapply(ap3, class)
#
# names(ap2mice1)
# names(ap2)
#
# ap3 <- ap2mice1
#
# #Change back to Integers
# which(sapply(ap3, class)=="factor")
#
# # asNumeric <- function(x) as.numeric(as.character(x))
# # asNumeric <- function(x) as.numeric(x)
# # factorsNumeric <- function(d) modifyList(d, lapply(d[, sapply(d, is.factor)], asNumeric))
# # ap4 <- factorsNumeric(ap3)
# # sapply(ap4, class)
#
# #balance MajDo and Non-MajDo
# set.seed(111); Samp <- sample(1:3000, 1630)
# ap3x <- subset(ap3, MajDo ==0, )
# dim(ap3x)
# ap3x <- ap3x[Samp,]
# dim(ap3x)
# ap3y <- rbind(ap3x, subset(ap3, MajDo==1,))
# dim(ap3y)
#
# #try PLS or PCA
# KMO(ap3y[, 1:59])
# rangeStd <- function(x) {(x-min(x))/(max(x)-min(x))}
# stdData <- apply(subset(ap3y, , 1:59),2, rangeStd)
# pc <- principal(stdData, nfactors=6, rotate='varimax')
# par(mfrow=c(1,1))
# plot(pc$values[1:6], type="b", main="Scree Plot")
# #each of these numbers amount of vari encapsulated in each PC of this dataset
# pc$weights
# head(pc$scores) # this is a new dataset with 4 PC as x1,x2,x3,x4
# pc$loadings #Important: this is a correlation of original variable with PC
#
# ###############################################################################
# # Clustering
# cl <- kmeans(stdData, 2)
# stdData1 <- cbind(stdData, CLUSTER=as.factor(cl$cluster), MJ = ap3y$MajDo)
# colnames(stdData1)
# #plot
# ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2)) +
# #geom_point(aes(color=as.factor(stdData1[,61]), size=as.factor(stdData1[,60]))) +
#
# geom_point(aes(color=as.factor(stdData1[,61]), shape=as.factor(stdData1[,60]), size=5),
# alpha=0.9, position= "jitter") +
# #scale_size_continuous(range=c(1,2)) +
# theme_bw() +
# ggtitle('First 2 prinipal components clustering w/ kmeans') +
# xlab("Principal Component 1") + ylab("Principal Component 2")
#
# ap4 <- cbind(ap3y, CLUSTER=as.factor(cl$cluster))
# table(ap4$MajDo, ap4$CLUSTER)
# # ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,59]))) +
# # geom_point(size=5) + theme_bw() +
# # ggtitle('First 2 prinipal components clustering w/ kmeans') +
# # xlab("Principal Component 1") + ylab("Principal Component 2")
#
# #ap2X <- ap2X %>% BBmisc::dropNamed("CLUSTER")
#
# cl <- kmeans(stdData, 2)
# stdData1 <- cbind(stdData, CLUSTER=as.factor(cl$cluster))
# ggplot(as.data.frame(pc$scores), aes(x=RC1, y=RC2, color=as.factor(stdData1[,59]))) +
# geom_point(size=5) + theme_bw() +
# ggtitle('First 2 prinipal components clustering w/ kmeans') +
# xlab("Principal Component 1") + ylab("Principal Component 2")
#
# ####################################################################################
# # MODELING
# #ap2X dataset
# #GBM, RF, PLS, Logistic Modeling
# setwd("~/Documents/ACS/Acxion2")
# ap2X <- read.csv("~/Documents/ACS/Acxion2/ap2X.csv", header=T)
# # library(gbm)
# # gbmModel <- gbm(MajDo ~ ., data=tr, #aa4
# # distribution="bernoulli",
# # interaction.depth=5, #10
# # n.trees=30, # 200
# # shrinkage=0.01, #0.01
# # verbose=T) #F
#
# names(ap2X)
# ap2X$MajDo <- as.factor(ap2X$MajDo)
# set.seed(10001)
# inTrain <- createDataPartition(ap2X$MajDo, p=.8, list=F)
# tr <- ap2X[inTrain,]
# ts <- ap2X[-inTrain, ]
# table(tr$MajDo); table(ts$MajDo)
# class(tr$MajDo); class(ts$MajDo)
# tr$MajDo <- as.factor(tr$MajDo)
# ts$MajDo <- as.factor(ts$MajDo)
#
# ############ GBM ##############################################
#
# controlObject <- trainControl(method="cv", number=10) #...1
# gbmGrid <- expand.grid(n.trees=seq(1,300, by =50),
# interaction.depth= seq(1,6, by=2),
# shrinkage=c(0.1,0.01), n.minobsinnode=20) #...2
#
# gbmModel1 <- train(MajDo ~ ., data=tr,
# method="gbm",
# tuneGrid=gbmGrid,
# verbose=F,
# trControl=controlObject)
# #results
# confusionMatrix(gbmModel1) #...results
# gbmModel1 #...results
# plot(gbmModel1) #...results
# summary(gbmModel1, 6) # #...results #Top 6 Variables
#
# gbmModel1$results[which.max(gbmModel1$result[,5]),] # best result
#
# #Prediction
# pred2 <- predict(gbmModel1, ts[,-59], type="raw")
# m <- table(pred2,ts$MajDo)
# m
# (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#
# ############## GBM ########################################
# gbmGrid <- expand.grid(n.trees=(1:30)*20,
# interaction.depth= seq(1,12, by=3),
# shrinkage=c(0.1,0.1), n.minobsinnode=20)
# fitControl <- trainControl(method="repeatedcv", number=10, repeats=10) # 1
# gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1)
# gbmFit2 <- train(MajDo ~., data=tr1,
# method="gbm",
# trControl = fitControl,
# verbose=F, tuneGrid = gbmGrid)
# summary(gbmFit2, 6) # Top 6 Variables
# confusionMatrix(gbmFit2) # *** shows results in percentage
#
# ################# GBM ##################################################
# fitControl <- trainControl(method="repeatedcv", number=10, repeats=10) # 1
# set.seed(888)
# gbmFit1 <- train(Class~., data=training, method="gbm", trControl = fitControl, verbose=F
# gbmFit1$results[which.max(gbmFit1$result[,4]),] # best result
# summary(gbmFit1, 6) # Top 6 Variables
# #tripling the n.tree from 150 to 450 for demo purposes
# gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1)
# # model1 & 2 side by side
# trellis.par.set(caretTheme())
# par(mfrow=c(1,2))
# plot(gbmModel1)
# plot(gbmFit2)
# plot(gbmFit3)
# plot(gbmFit2, metric = "Kappa", plotType = "level",scales = list(x = list(rot = 90))) #KAPPA
#
# ################# GBM ##################################################
# set.seed=777
# gbmGrid <- expand.grid(interaction.depth=c(1,5,9), n.trees=(1:30)*20, shrinkage=.1, n.minobsinnode=20)
# fitControl=trainControl(method="repeatedcv", number=10,repeats=10, classProbs = T, summaryFunction = twoClassSummary) #2
# gbmFit3 <- train(MajDo~., data=tr, method="gbm", trControl = fitControl, verbose=F, tuneGrid = gbmGrid, metric="ROC")
# gbmFit3$results[which.max(gbmFit3$result[,5]),] # best result
#
# summary(gbmFit3, 6) # Top 6 Variables
# which2Pct <- tolerance(gbmFit3$results, metric="ROC", tol=2, maximize=TRUE)
# gbmFit3$results[which2Pct, 1:7]
#
# pred3 <- predict(gbmFit3, newdata=ts[,-59], type="raw")
# m <- table(pred3,ts$MajDo)
# m
# (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
# confusionMatrix(gbmFit3) # *** shows results in percentage
#
# ################## RANDOM FOREST ######################################################
# ctrl <- trainControl(method = "CV",
# number=10,
# classProbs = TRUE,
# allowParallel = TRUE,
# summaryFunction = twoClassSummary)
#
# set.seed(476)
# rfFit <- train(MajDo ~. ,
# data=tr,
# method = "rf",
# tuneGrid = expand.grid(.mtry = seq(1,50,by=5)),
# ntrees=300,
# importance = TRUE,
# metric = "ROC",
# trControl = ctrl)
# plot(rfFit)
# ts1 <- na.omit(ts)
# pred <- predict(rfFit, newdata = ts1[,-60], type = "raw")
# m <- table(pred,ts1$MajDo)
# m
# (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
# varImp(rfFit)
# x <- (varImp(rfFit))
# xx <- do.call("cbind", x)
#
# row.names(xx)
# ########### LOGISTIC#######################################################
#
# mod1 <- glm(MajDo ~ ., data=tr, family=binomial)
# pred <- predict(mod1, newdata=ts[,-59], type="response")
# head(round(pred,2),20)
# pred2 <- ifelse(predict(mod1, newdata= ts[,-59], type="response")>.5, 1, 0)
# m <- table(pred2,ts$MajDo)
# m
# (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#
# ############ PLS ######################################################
#
# plsFit <- train(MajDo ~., data=tr, method="pls", preProc=c("center", "scale"))
#
# plsFit1 <- train(MajDo~., data=tr, method = "pls",
# tuneLength=15, metric="ROC",
# trControl=(trainControl(method="repeatedcv", repeats=3,
# classProbs=TRUE, summaryFunction=twoClassSummary)),
# preProc=c("center","scale"))
# plsFit1
# plot(plsFit1)
# plsClasses <- predict(plsFit1, newdata=ts[,-59])
# str(plsClasses)
# confusionMatrix(data=plsClasses, ts$MajDo)
# m <- table(plsClasses,ts$MajDo)
# m
# (m[1]+m[4])/(m[1]+m[2]+m[3]+m[4])
#
#
# ########### SVM #################################################
#