This is an example of data analysis and visualization using the R programming language. My goal is to give a glimpse of what can be done outside of Excel. I do not expect you to understand any of the code below, but you can read my notes to see the purpose of each code chunk. You can also try to run the code by following the Step-by-Step Instructions below.
# Packages ----
# Attach caret quietly (any messages emitted while attaching are suppressed).
suppressMessages(library(caret))
library(AppliedPredictiveModeling)

# Data ----
# Fix the RNG seed so the random train/test partition below is reproducible.
set.seed(3433)
# NOTE(review): presumably this is what provides `diagnosis` and `predictors`
# used on the next line -- confirm against the AppliedPredictiveModeling docs.
data(AlzheimerDisease)
adData <- data.frame(diagnosis, predictors)

# Train/test split ----
# createDataPartition() is asked for p = 3/4 of the rows, based on diagnosis;
# the first element of its result holds the training-row indices.
inTrain <- createDataPartition(adData$diagnosis, p = 3 / 4)[[1]]
training <- adData[inTrain, ]
testing <- adData[-inTrain, ]

# Interleukin predictors ----
# Keep every column name of the training set that begins with "IL".
predName <- names(training)
ILpredictor <- predName[startsWith(predName, "IL")]
ILpredictor
## [1] "IL_11" "IL_13" "IL_16" "IL_17E"
## [5] "IL_1alpha" "IL_3" "IL_4" "IL_5"
## [9] "IL_6" "IL_6_Receptor" "IL_7" "IL_8"

# PCA ----
# Run principal components on the IL columns with caret's preProcess() and
# report how many components are needed to capture 90% of the variance.
ProcPCA <- preProcess(training[, ILpredictor], method = "pca", thresh = 0.9)
ProcPCA[["numComp"]]
## [1] 9
# Plot interleukin values by diagnosis, i.e. between patients ("impaired")
# and normal individuals ("control").
library(ggplot2)

# Build one violin + point plot of a single column of adData against
# diagnosis, coloured by diagnosis, with the legend hidden.
#   column: name of the adData column to put on the y axis (string)
#   label:  y-axis label
# The column is looked up through the `.data` pronoun instead of
# `adData$<col>`: referring to the data frame with `$` inside aes() bypasses
# the `data` argument and is discouraged by ggplot2.
plot_il_by_diagnosis <- function(column, label) {
  ggplot(
    data = adData,
    aes(x = diagnosis, y = .data[[column]], colour = diagnosis)
  ) +
    geom_violin() +
    geom_point() +
    theme(legend.position = "none") +
    ylab(label)
}

il1a <- plot_il_by_diagnosis("IL_1alpha", "IL-1a")
il3 <- plot_il_by_diagnosis("IL_3", "IL-3")
il4 <- plot_il_by_diagnosis("IL_4", "IL-4")
il5 <- plot_il_by_diagnosis("IL_5", "IL-5")
il6 <- plot_il_by_diagnosis("IL_6", "IL-6")
il7 <- plot_il_by_diagnosis("IL_7", "IL-7")
il8 <- plot_il_by_diagnosis("IL_8", "IL-8")
il11 <- plot_il_by_diagnosis("IL_11", "IL-11")
il13 <- plot_il_by_diagnosis("IL_13", "IL-13")
il16 <- plot_il_by_diagnosis("IL_16", "IL-16")
#' Draw several plots on one page, arranged in a grid.
#'
#' @param ... Plot objects. Each must have a print method that accepts a
#'   `vp` (grid viewport) argument whenever more than one plot is drawn.
#' @param plotlist Optional list of further plot objects; they are appended
#'   after the ones passed through `...`.
#' @param file Not used by the function body; kept in the signature so that
#'   existing calls keep working.
#' @param cols Number of columns in the generated layout. Ignored when
#'   `layout` is supplied.
#' @param layout Optional matrix of plot indices: the cell(s) whose value is
#'   `i` give the position of the i-th plot. When NULL, a matrix with `cols`
#'   columns and as many rows as needed is generated.
#' @return Called for its drawing side effect. With no plots at all nothing
#'   is drawn and `NULL` is returned invisibly; with a single plot the value
#'   of its `print()` call is returned; otherwise `NULL`, invisibly.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Make a single list from the ... arguments and plotlist.
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw. Without this guard the layout construction and the plot
  # loop below would fail on an empty input.
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If no layout was given, derive one from 'cols':
  # ncol = requested columns, nrow = rows needed to hold every plot.
  if (is.null(layout)) {
    rows <- ceiling(numPlots / cols)
    layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page. grid functions are namespace-qualified so the function
    # does not have to attach the grid package as a side effect.
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )

    # Draw each plot in the layout cell(s) carrying its index.
    for (i in seq_len(numPlots)) {
      # Row/column positions of the cells that contain this subplot.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(
        plots[[i]],
        vp = grid::viewport(
          layout.pos.row = matchidx$row,
          layout.pos.col = matchidx$col
        )
      )
    }
  }
}
# Put the ten interleukin plots on a single page, five columns wide.
multiplot(
  plotlist = list(il1a, il3, il4, il5, il6, il7, il8, il11, il13, il16),
  cols = 5
)