Exploratory Data Analysis using Data Visualization 
      
      
      

A management consultant is studying the roles played by experience and training in a system administrator’s ability to complete a set of tasks in a specified amount of time. In particular, she is interested in discriminating between administrators who are able to complete the given tasks within a specified time and those who are not. Data are collected on the performance of 75 randomly selected administrators. They are stored in the file sa.csv.

# Load the administrator performance data from sa.csv.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, but the grouped summary below (which
# codes CompletedTask as No = 1 / Yes = 2) and the classifiers later in the
# file expect CompletedTask to be a factor.
dataset <- read.csv("sa.csv", header = TRUE, stringsAsFactors = TRUE)
# summary(dataset)
# Descriptive statistics for every column, split by the CompletedTask group.
# NOTE(review): describeBy() is presumably psych::describeBy -- the package
# loading code is not visible in this part of the file.
describeBy(dataset, "CompletedTask")
## 
##  Descriptive statistics by group 
## group: No
##                vars  n mean   sd median trimmed  mad min  max range skew
## Experience        1 60 6.01 1.59   5.85    5.90 1.41 2.7 12.2   9.5 1.01
## Training          2 60 4.50 1.08   4.00    4.25 0.00 4.0  8.0   4.0 2.02
## CompletedTask*    3 60 1.00 0.00   1.00    1.00 0.00 1.0  1.0   0.0  NaN
##                kurtosis   se
## Experience         2.32 0.20
## Training           3.08 0.14
## CompletedTask*      NaN 0.00
## -------------------------------------------------------- 
## group: Yes
##                vars  n mean   sd median trimmed  mad min  max range skew
## Experience        1 15 9.95 1.86    9.9    9.88 1.93   7 13.7   6.7 0.27
## Training          2 15 5.07 1.49    4.0    4.92 0.00   4  8.0   4.0 0.87
## CompletedTask*    3 15 2.00 0.00    2.0    2.00 0.00   2  2.0   0.0  NaN
##                kurtosis   se
## Experience        -0.91 0.48
## Training          -0.78 0.38
## CompletedTask*      NaN 0.00
# Scatter plot: Training against Experience, one point per row of `dataset`,
# coloured by CompletedTask.
plot1 <- ggplot(
  data = dataset,
  mapping = aes(x = Experience, y = Training, color = CompletedTask)
) +
  geom_point()
cat("\n") # emit a newline before the plot is rendered
ggplotly(plot1)
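# TODO: describe what the scatterplot shows for this dataset. The previous
# commented-out note ("owners class seems to have the higher average income")
# was carried over from a different analysis and does not apply here.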
#cat(sprintf("\n"))
#cat(sprintf("\n"))
#cat(sprintf("\n"))
# Box plots of each numeric predictor, split by CompletedTask.

# Experience by CompletedTask
plot2 <- ggplot(
  data = dataset,
  mapping = aes(x = CompletedTask, y = Experience)
) +
  geom_boxplot(fill = "red")
cat("\n") # emit a newline before the plot is rendered
ggplotly(plot2)

# Training by CompletedTask
plot3 <- ggplot(
  data = dataset,
  mapping = aes(x = CompletedTask, y = Training)
) +
  geom_boxplot(fill = "red")
ggplotly(plot3)
# Logistic regression plots: the binary CompletedTask_Yes indicator against
# each predictor in turn, with a binomial GLM curve overlaid.

# Expand CompletedTask into indicator columns; CompletedTask_Yes is the
# response plotted below. Computed once and shared by both plots.
# NOTE(review): dummy_columns() is presumably fastDummies::dummy_columns --
# confirm against the package loading code, which is not visible here.
ds <- dummy_columns(dataset, select_columns = "CompletedTask")

# Task completion against Training
plot4 <- ggplot(ds, aes(Training, CompletedTask_Yes)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )

ggplotly(plot4)

# Task completion against Experience
plot5 <- ggplot(ds, aes(Experience, CompletedTask_Yes)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )

ggplotly(plot5)
# Classification models for CompletedTask, using every other column of
# `dataset` as a predictor.

# Work on a copy whose outcome is guaranteed to be a factor: C5.0() rejects a
# non-factor outcome, and read.csv() yields a character column on R >= 4.0
# unless stringsAsFactors = TRUE was requested. factor() leaves an existing
# factor's values unchanged, so this is safe either way.
model_data <- dataset
model_data$CompletedTask <- factor(model_data$CompletedTask)
# Predictors are selected by name rather than by column position, so the code
# keeps working if the column order of the CSV changes.
predictor_cols <- setdiff(names(model_data), "CompletedTask")

# Recursive-partitioning tree (rpart) and its plot
rpartmodel <- rpart(CompletedTask ~ ., data = model_data)
rpart.plot(rpartmodel)

# Conditional inference tree
# NOTE(review): ctree() could come from party or partykit -- confirm which
# package the file loads.
ctreemodel <- ctree(CompletedTask ~ ., data = model_data)
plot(ctreemodel)

# C5.0 tree, fitted through the x/y interface as in the original call
cfivemodel <- C5.0(
  x = model_data[, predictor_cols, drop = FALSE],
  y = model_data$CompletedTask
)
plot(cfivemodel)

# Pairwise partition plot using linear discriminant analysis
partimat(CompletedTask ~ ., data = model_data, method = "lda")