Load and Inspect the Iris Dataset Start by loading the built-in iris dataset, inspecting its structure, and checking for missing values.
# Bring the built-in iris data frame into the workspace and preview its
# first six rows.
data("iris")
head(iris, n = 6L)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
# Print the compact structure of iris: number of observations, and the type
# and leading values of every column.
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Count the missing values in each column, one column at a time.
vapply(iris, function(column) sum(is.na(column)), numeric(1))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 0 0 0 0 0
Data Preparation Ensure Species is a factor (it already is in the built-in dataset) and clean up column names by lower-casing them and replacing dots with underscores.
# Make sure the Species column is stored as a factor, then re-check the
# structure of the data frame.
iris[["Species"]] <- as.factor(iris[["Species"]])
str(object = iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
# Normalise the column names in a single pass: lower-case them, then swap
# every literal dot for an underscore.
names(iris) <- gsub(".", "_", tolower(names(iris)), fixed = TRUE)
Remove duplicate rows and create a new variable for the petal length to width ratio.
# Keep only the first occurrence of each fully identical row.
iris <- unique(iris)
# Install dplyr only when it is missing, so that re-running the script does
# not reinstall the package (and require network access) every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Create new variable: the ratio of petal length to petal width.
# (A stray markdown code fence that preceded this statement was removed; it is
# not valid R.)
iris <- mutate(iris, petal_length_width_ratio = petal_length / petal_width)
View and Summarize the Data Inspect the modified iris dataset.
# View() opens a spreadsheet-style viewer and is only meaningful in an
# interactive session, so skip it when the script is run in batch or knitted.
if (interactive()) {
  View(iris)
}
# Per-column summary statistics (and level counts for the factor column).
summary(iris)
## sepal_length sepal_width petal_length petal_width species
## Min. :4.300 Min. :2.00 Min. :1.000 Min. :0.100 setosa :50
## 1st Qu.:5.100 1st Qu.:2.80 1st Qu.:1.600 1st Qu.:0.300 versicolor:50
## Median :5.800 Median :3.00 Median :4.300 Median :1.300 virginica :49
## Mean :5.844 Mean :3.06 Mean :3.749 Mean :1.195
## 3rd Qu.:6.400 3rd Qu.:3.30 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.40 Max. :6.900 Max. :2.500
## petal_length_width_ratio
## Min. : 2.125
## 1st Qu.: 2.810
## Median : 3.300
## Mean : 4.321
## 3rd Qu.: 4.667
## Max. :15.000
# Re-check the structure after cleaning: row count, column names and types,
# including the newly added ratio column.
str(iris)
## 'data.frame': 149 obs. of 6 variables:
## $ sepal_length : num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ sepal_width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ petal_length : num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ petal_width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ petal_length_width_ratio: num 7 7 6.5 7.5 7 ...
# Open the data viewer only in an interactive session; View() has no use when
# the script is run in batch or knitted.
if (interactive()) {
  View(iris)
}
Save the Cleaned Dataset Save the cleaned iris dataset to a CSV file and reload it to confirm.
# Persist the cleaned data without row names, then read it back.
write.csv(iris, "cleaned_iris_data.csv", row.names = FALSE)
# stringsAsFactors = TRUE restores the species column as a factor; without it
# the reloaded copy would hold plain character strings.
iris_data <- read.csv("cleaned_iris_data.csv", stringsAsFactors = TRUE)
# Confirm that the round trip kept every row and every column name.
stopifnot(
  identical(dim(iris_data), dim(iris)),
  identical(names(iris_data), names(iris))
)
Install and Load Visualization Packages Install and load the ggplot2 and plotly packages.
# Install ggplot2 only when it is missing, so that re-running the script does
# not reinstall the package (and require network access) every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(ggplot2)
# Install plotly only when it is missing, so that re-running the script does
# not reinstall the package (and require network access) every time.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}
## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Box Plot Using Plotly Create a box plot using plotly.
# Horizontal box plot of petal length, one box per species, built from the
# cleaned iris data. The previous call plotted ggplot2's unrelated `midwest`
# dataset rather than the data this analysis is about.
fig <- plot_ly(iris, x = ~petal_length, color = ~species, type = "box")
fig
Scatter Plot of Sepal Dimensions Create a scatter plot of Sepal Length vs. Sepal Width, colored by species.
# Scatter plot of sepal length against sepal width, one colour per species.
ggplot(
  data = iris,
  mapping = aes(x = sepal_length, y = sepal_width, color = species)
) +
  geom_point() +
  labs(
    x = "Sepal Length",
    y = "Sepal Width",
    title = "Scatter Plot of Sepal Length vs. Sepal Width"
  )
Histogram of Petal Length Create a histogram of Petal Length.
# Histogram of petal length using 0.5-wide bins, blue bars with black outlines.
ggplot(data = iris, mapping = aes(x = petal_length)) +
  geom_histogram(binwidth = 0.5, fill = "blue", color = "black") +
  labs(
    x = "Petal Length",
    y = "Count",
    title = "Histogram of Petal Length"
  )
Density Plot of Petal Length by Species Create a density plot of Petal
Length by Species.
# Overlaid, semi-transparent density curves of petal length, one per species.
ggplot(data = iris, mapping = aes(x = petal_length, fill = species)) +
  geom_density(alpha = 0.5) +
  labs(
    x = "Petal Length",
    y = "Density",
    title = "Density Plot of Petal Length by Species"
  )
Pairwise Plot Matrix Create a pairwise plot matrix using GGally’s
ggpairs function.
# Install GGally only when it is missing, matching how the other packages in
# this script are obtained; a commented-out install leaves library() to fail
# on a machine that does not have the package yet.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Pairwise plot matrix of every column in iris, coloured by species.
ggpairs(data = iris, mapping = aes(color = species))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.