---
title: "data visualization & data mining"
subtitle: "using ggplot2 & caret R package "
author: "kirit ved"
date: today
format:
html:
embed-resources: true
standalone: true
code-tools: true
code-block-bg: true
code-fold: true
code-copy: false
theme: default
toc: true
table-of-contents: true
toc-location: body
toc-title: "toc"
toc-expand: true
code-link: true
fontcolor: "#BD9A7A"
fontsize: "15"
# backgroundcolor: "#ADDBE6"
#title-block-banner-color: "#121234"
title-block-banner: true
title-block-style: default
editor: visual
---
## ggplot2
### setting R environment
```{r}
#| warning: true
#| message: true
#| echo: false
# Clear every object (including dot-prefixed ones) from the workspace before
# the helpers below are defined.
# NOTE(review): this is presumably redundant when the document is rendered in
# a fresh R session; kept to preserve the original behaviour.
rm(list = ls(all.names = TRUE))
# Fix the RNG seed so the train/test split and the model fits are repeatable.
set.seed(1)
# Install pacman on first use. requireNamespace() only checks availability;
# the calls below are namespace-qualified so they also work right after a
# fresh install, when pacman has not been attached yet.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
# p_load() attaches the listed packages (pacman itself included).
pacman::p_load(pacman, tidyverse, janitor, kbv, caret, kernlab, nnet, NeuralNetTools)
# Show which packages ended up loaded.
pacman::p_loaded()
# Draw a 40-bin histogram of one column, with bars filled by a second column.
#
# df: data frame (or tibble) holding both columns.
# cn: position of the column to bin; it indexes colnames(df).
# cf: position of the column mapped to the fill colour.
#
# The plot is printed as a side effect (nothing is auto-printed inside a loop)
# and returned invisibly, so a top-level call does not render it twice.
myhist <- function(df, cn, cf) {
  colnm <- colnames(df)
  x_name <- colnm[cn]
  fill_name <- colnm[cf]
  ttl <- paste0("histogram of ", x_name, " v/s ", fill_name)
  # Map by column name through the .data pronoun: this works for tibbles as
  # well as data frames (df[, cn] is a 1-column tibble for tibble input).
  g <- ggplot(df, aes(x = .data[[x_name]], fill = .data[[fill_name]])) +
    geom_histogram(bins = 40, color = "black") +
    # Title the legend with the real column name instead of a local variable.
    labs(title = ttl, fill = fill_name) +
    theme_dark() +
    xlab(x_name) +
    # kbv_tmpx() comes from the kbv package; its effect is not visible here.
    kbv_tmpx() +
    theme(plot.title = element_text(hjust = .5, color = "brown", size = 15))
  print(g)
  invisible(g)
}
# Draw a density plot of one column, with one filled curve per level of a
# second column.
#
# df: data frame (or tibble) holding both columns.
# cn: position of the column whose density is drawn; it indexes colnames(df).
# cf: position of the column mapped to the fill colour.
#
# The plot is printed as a side effect (nothing is auto-printed inside a loop)
# and returned invisibly, so a top-level call does not render it twice.
mydensity <- function(df, cn, cf) {
  colnm <- colnames(df)
  x_name <- colnm[cn]
  fill_name <- colnm[cf]
  ttl <- paste0("density plot of ", x_name, " v/s ", fill_name)
  # Map by column name through the .data pronoun: this works for tibbles as
  # well as data frames (df[, cn] is a 1-column tibble for tibble input).
  g <- ggplot(df, aes(x = .data[[x_name]], fill = .data[[fill_name]])) +
    geom_density(color = "black") +
    # Title the legend with the real column name instead of a local variable.
    labs(title = ttl, fill = fill_name) +
    theme_dark() +
    xlab(x_name) +
    # kbv_tmpx() comes from the kbv package; its effect is not visible here.
    kbv_tmpx() +
    theme(plot.title = element_text(hjust = .5, color = "brown", size = 15))
  print(g)
  invisible(g)
}
# Draw a scatter plot of two columns: red points overlaid with a green line
# joining the observations.
#
# df:  data frame (or tibble) holding both columns.
# cn1: position of the column shown on the x axis; it indexes colnames(df).
# cn2: position of the column shown on the y axis.
#
# The plot is printed as a side effect (nothing is auto-printed inside a loop)
# and returned invisibly, so a top-level call does not render it twice.
mysp <- function(df, cn1, cn2) {
  colnm <- colnames(df)
  x_name <- colnm[cn1]
  y_name <- colnm[cn2]
  ttl <- paste0("scattered plot of ", x_name, " v/s ", y_name)
  # Map by column name through the .data pronoun: this works for tibbles as
  # well as data frames (df[, cn1] is a 1-column tibble for tibble input).
  g <- ggplot(df, aes(x = .data[[x_name]], y = .data[[y_name]])) +
    geom_point(color = "red") +
    labs(title = ttl) +
    geom_line(color = "green") +
    theme_dark() +
    # kbv_tmpx() comes from the kbv package; its effect is not visible here.
    kbv_tmpx() +
    theme(plot.title = element_text(hjust = .5, color = "brown", size = 15)) +
    xlab(x_name) +
    ylab(y_name)
  print(g)
  invisible(g)
}
# Draw one boxplot of a measurement column per level of a grouping column.
#
# df: data frame (or tibble) holding both columns.
# cn: position of the measurement column; it indexes colnames(df).
# cf: position of the grouping column (also used for the fill colour).
#
# The plot is printed as a side effect (nothing is auto-printed inside a loop)
# and returned invisibly, so a top-level call does not render it twice.
mybp <- function(df, cn, cf) {
  colnm <- colnames(df)
  value_name <- colnm[cn]
  group_name <- colnm[cf]
  ttl <- paste0("boxplot of ", value_name, " v/s ", group_name)
  # Same aesthetic mapping as before (measurement on x, group on y, then
  # coord_flip()), expressed by column name so tibbles work as well.
  g <- ggplot(
    df,
    aes(
      x = .data[[value_name]],
      y = .data[[group_name]],
      fill = .data[[group_name]]
    )
  ) +
    geom_boxplot(color = "black") +
    labs(title = ttl, fill = group_name) +
    theme_dark() +
    coord_flip() +
    # kbv_tmpx() comes from the kbv package; its effect is not visible here.
    # The `+` after it was missing, so the theme() and axis-label calls below
    # were evaluated as a separate, discarded expression and never applied.
    kbv_tmpx() +
    theme(
      plot.title = element_text(hjust = .5, color = "brown", size = 15),
      plot.background = element_rect(color = "lightblue")
    ) +
    # Each label names the column mapped to that aesthetic. The previous
    # labels (measurement name on y, "median" on x) would have described the
    # wrong axes once actually attached to the plot.
    xlab(value_name) +
    ylab(group_name)
  print(g)
  invisible(g)
}
```
### loading iris dataset & viewing it
```{r}
#| warning: false
#| message: false
# Working copy of iris with column names normalised by janitor::clean_names().
d <- janitor::clean_names(iris)
# Quick inspection: first rows, last rows, structure, per-column summary.
head(d)
tail(d)
str(d)
summary(d)
```
### viewing histogram
```{r}
#| warning: false
#| message: false
#| echo: false
# One histogram per column 1-4 of d, each filled by column 5. myhist() prints
# the plot itself; invisible() keeps the collected return values out of the
# output.
invisible(lapply(1:4, function(col_idx) myhist(d, col_idx, 5)))
```
### viewing density plot
```{r}
#| warning: false
#| message: false
#| echo: false
# One density plot per column 1-4 of d, each filled by column 5. mydensity()
# prints the plot itself; invisible() keeps the collected return values out of
# the output.
invisible(lapply(1:4, function(col_idx) mydensity(d, col_idx, 5)))
```
### viewing scatter plots
```{r}
#| warning: false
#| message: false
#| echo: false
# Scatter plot for every unordered pair (i < j) of columns 1-4.
# combn(4, 2) lists the pairs column-wise in the same order as the nested
# loop: (1,2), (1,3), (1,4), (2,3), (2,4), (3,4).
col_pairs <- combn(4, 2)
for (k in seq_len(ncol(col_pairs))) {
  mysp(d, col_pairs[1, k], col_pairs[2, k])
}
```
### viewing box plots
```{r}
#| warning: false
#| message: false
#| echo: false
# One boxplot per column 1-4 of d, each grouped by column 5. mybp() prints
# the plot itself; invisible() keeps the collected return values out of the
# output.
invisible(lapply(1:4, function(col_idx) mybp(d, col_idx, 5)))
```
### viewing bar plots
```{r}
#| warning: false
#| message: false
#| echo: false
# Per-species summary of sepal_length: row count (cnt), mean (m) and standard
# deviation (s), each rounded to 2 decimals.
d1 <- d |>
  group_by(species) |>
  summarise(cnt = n(), m = mean(sepal_length), s = sd(sepal_length)) |>
  mutate(cnt = round(cnt, 2), m = round(m, 2), s = round(s, 2))
d1

# Bar chart of one summary column per species, with the value written on each
# bar.
#
# stats:    summary table containing `species` and the column named stat_col.
# stat_col: name (string) of the column used for bar height and bar label.
#
# Replaces three copy-pasted plot calls, two of which carried a stale
# `label = cnt` aesthetic that the text layer then had to override.
plot_species_stat <- function(stats, stat_col) {
  ggplot(stats, aes(x = species, y = .data[[stat_col]], fill = species)) +
    geom_col() +
    # kbv_tmpx() comes from the kbv package; its effect is not visible here.
    kbv_tmpx() +
    geom_text(aes(label = .data[[stat_col]]), vjust = 5) +
    # Keep the y-axis title equal to the plain column name.
    labs(y = stat_col)
}

# Plots must be printed explicitly inside a loop.
for (stat_col in c("cnt", "m", "s")) {
  print(plot_species_stat(d1, stat_col))
}
```
## caret R package for data mining
### create data partition for training & testing
```{r}
#| warning: false
#| message: false
#| echo: false
# Share of the rows that goes into the training set.
sr <- 2 / 3
# Make the sample size an explicit whole number; a fractional size would
# otherwise be truncated silently by sample().
n_train <- round(nrow(d) * sr)
# Row positions drawn at random (without replacement) for training.
tmp <- sample(seq_len(nrow(d)), n_train)
# d1 = training rows, d2 = the remaining rows for testing.
d1 <- d[tmp, ]
# setdiff() instead of d[-tmp, ]: negative indexing with an empty `tmp`
# would select zero rows rather than all of them.
d2 <- d[setdiff(seq_len(nrow(d)), tmp), ]
nrow(d1)
nrow(d2)
```
### random forest using caret
```{r}
#| warning: false
#| message: false
#| echo: false
# Fit a random forest on the training rows (d1), tuned by 10-fold
# cross-validation.
rfModel <- train(
  species ~ .,
  data = d1,
  method = "rf",
  trControl = trainControl(method = "cv", number = 10)
)
print(rfModel)
# Predict species for the held-out rows (d2) and compare with the truth.
predictions <- predict(rfModel, newdata = d2)
confusionMatrix(data = predictions, reference = d2$species)
```
### decision tree model
```{r}
#| warning: false
#| message: false
#| echo: false
# Fit an rpart decision tree on the training rows (d1), tuned by 10-fold
# cross-validation.
dtModel <- train(
  species ~ .,
  data = d1,
  method = "rpart",
  trControl = trainControl(method = "cv", number = 10)
)
# Score the held-out rows (d2) and tabulate predictions against the truth.
predictions <- predict(dtModel, newdata = d2)
confusionMatrix(data = predictions, reference = d2$species)
```
### svm model
```{r}
#| warning: false
#| message: false
#| echo: false
# Fit a radial-kernel SVM on the training rows (d1), tuned by 10-fold
# cross-validation.
svmModel <- train(
  species ~ .,
  data = d1,
  method = "svmRadial",
  trControl = trainControl(method = "cv", number = 10)
)
# Score the held-out rows (d2) and tabulate predictions against the truth.
predictions <- predict(svmModel, newdata = d2)
confusionMatrix(data = predictions, reference = d2$species)
```
### neural network model
```{r}
#| warning: false
#| message: false
#| echo: false
# Fit an nnet neural network on the training rows (d1), tuned by 10-fold
# cross-validation. `trace` is passed along with the fit arguments; it is
# spelled FALSE because `F` is an ordinary variable that can be reassigned.
nnModel <- train(
  species ~ .,
  data = d1,
  method = "nnet",
  trControl = trainControl(method = "cv", number = 10),
  trace = FALSE
)
print(nnModel)
# Plot of the caret fit, then plotnet()'s diagram of the fitted model.
plot(nnModel)
plotnet(nnModel)
# Score the held-out rows (d2) and tabulate predictions against the truth.
predictions <- predict(nnModel, newdata = d2)
confusionMatrix(predictions, d2$species)
```
### knn model
```{r}
#| warning: false
#| message: false
#| echo: false
# Fit a k-nearest-neighbours classifier on the training rows (d1).
# Predictors are centred and scaled first; 10 candidate tuning values are
# compared using cross-validation.
knnFit1 <- train(
  species ~ .,
  d1,
  method = "knn",
  preProcess = c("center", "scale"),
  tuneLength = 10,
  trControl = trainControl(method = "cv")
)
# Predict with the kNN fit itself. The previous code predicted from nnModel,
# so this section reported the neural network's confusion matrix again.
predictions <- predict(knnFit1, newdata = d2)
# Evaluate the model on the held-out rows (d2).
confusionMatrix(predictions, d2$species)
```
### variable selection/importance
```{r}
#| warning: false
#| message: false
#| echo: false
# Run Boruta feature selection for species against all other columns of d.
br <- Boruta::Boruta(species ~ ., d)
br
# Per-attribute importance statistics, with column names normalised by
# janitor (e.g. mean_imp). Only the column names are cleaned: the values of
# `decision` keep Boruta's original capitalisation.
# NOTE(review): `as` shadows the name of base::as(); the name is kept in case
# code outside this chunk refers to it.
as <- Boruta::attStats(br) |>
  janitor::clean_names()
str(as)
# Most important attributes first, dropping the rejected ones. The level is
# spelled "Rejected"; the previous lower-case "rejected" never matched, so
# nothing was filtered out.
as |>
  arrange(desc(mean_imp)) |>
  filter(decision != "Rejected")
```