Exploratory Data Analysis using Data Visualization
A management consultant is studying the roles played by experience and training in a system administrator’s ability to complete a set of tasks in a specified amount of time. In particular, she is interested in discriminating between administrators who are able to complete given tasks within a specified time and those who are not. Data are collected on the performance of 75 randomly selected administrators. They are stored in the file sa.csv.
# Load the administrator performance data from sa.csv (the first row holds the
# column names).
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 reads text columns
# as character by default. CompletedTask is used further down as a
# classification outcome, and C5.0() in particular requires a factor there.
dataset <- read.csv("sa.csv", header = TRUE, stringsAsFactors = TRUE)
# summary(dataset)
# Descriptive statistics for every column, reported separately for each
# CompletedTask group.
describeBy(dataset, "CompletedTask")
##
## Descriptive statistics by group
## group: No
## vars n mean sd median trimmed mad min max range skew
## Experience 1 60 6.01 1.59 5.85 5.90 1.41 2.7 12.2 9.5 1.01
## Training 2 60 4.50 1.08 4.00 4.25 0.00 4.0 8.0 4.0 2.02
## CompletedTask* 3 60 1.00 0.00 1.00 1.00 0.00 1.0 1.0 0.0 NaN
## kurtosis se
## Experience 2.32 0.20
## Training 3.08 0.14
## CompletedTask* NaN 0.00
## --------------------------------------------------------
## group: Yes
## vars n mean sd median trimmed mad min max range skew
## Experience 1 15 9.95 1.86 9.9 9.88 1.93 7 13.7 6.7 0.27
## Training 2 15 5.07 1.49 4.0 4.92 0.00 4 8.0 4.0 0.87
## CompletedTask* 3 15 2.00 0.00 2.0 2.00 0.00 2 2.0 0.0 NaN
## kurtosis se
## Experience -0.91 0.48
## Training -0.78 0.38
## CompletedTask* NaN 0.00
# Scatter plot: Experience against Training, one colour per CompletedTask value
plot1 <- ggplot(
  dataset,
  aes(x = Experience, y = Training, color = CompletedTask)
) +
  geom_point()
cat("\n")
ggplotly(plot1)

# Box plots: distribution of each predictor within each CompletedTask value
plot2 <- ggplot(dataset, aes(x = CompletedTask, y = Experience)) +
  geom_boxplot(fill = "red")
cat("\n")
ggplotly(plot2)

plot3 <- ggplot(dataset, aes(x = CompletedTask, y = Training)) +
  geom_boxplot(fill = "red")
ggplotly(plot3)
# Logistic regression plots
# dummy_columns() adds indicator columns for CompletedTask; the
# CompletedTask_Yes column is plotted as the response on the y axis.
ds <- dummy_columns(dataset, select_columns = "CompletedTask")

# Fitted binomial GLM curve of CompletedTask_Yes against Training
plot4 <- ggplot(ds, aes(Training, CompletedTask_Yes)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )
ggplotly(plot4)

# Fitted binomial GLM curve of CompletedTask_Yes against Experience
plot5 <- ggplot(ds, aes(Experience, CompletedTask_Yes)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )
ggplotly(plot5)
# Tree-based classifiers and an LDA partition plot for CompletedTask

# rpart tree using every other column as a predictor
rpartmodel <- rpart(CompletedTask ~ ., data = dataset)
rpart.plot(rpartmodel)

# ctree model fitted on the same formula
ctreemodel <- ctree(CompletedTask ~ ., data = dataset)
plot(ctreemodel)

# C5.0 through its x/y interface: column 3 (CompletedTask) is the outcome and
# the remaining columns are the predictors
cfivemodel <- C5.0(x = dataset[, -3], y = dataset[, 3])
plot(cfivemodel)

# Partition plot of the LDA classification for the predictors
partimat(CompletedTask ~ ., data = dataset, method = "lda")