Load and Inspect the Iris Dataset

Start by loading the built-in iris dataset, inspecting its structure, and checking for missing values.

# Bring the built-in iris data frame into the workspace and preview the
# first six rows.
data("iris")
head(iris, n = 6L)

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

# Column types and leading values of every variable.
str(object = iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# Number of missing values in each column.
iris |>
  is.na() |>
  colSums()

## Sepal.Length  Sepal.Width Petal.Length  Petal.Width      Species 
##            0            0            0            0            0

Data Preparation

Convert Species to a factor (it is already a factor in the built-in dataset, so this step is only a safeguard) and clean up column names.

# Make sure the Species column is stored as a factor, then re-check the
# structure of the data frame.
iris[["Species"]] <- as.factor(iris[["Species"]])
str(object = iris)

## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

Convert column names to lowercase

# Lower-case every column name of the data frame.
colnames(iris) <- tolower(colnames(iris))

Replace periods in column names with underscores

# Swap every literal "." in the column names for "_".
names(iris) <- gsub(
  pattern = "\\.",
  replacement = "_",
  x = names(iris)
)

Remove duplicate rows and create a new variable for the petal length to width ratio.

# Keep only the first occurrence of each distinct row.
iris <- unique(iris)

Install and load the dplyr package

# Install dplyr only when it is not already available, so that re-running the
# script does not reinstall the package (and need network access) every time.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

# Attach dplyr so its verbs (e.g. mutate) can be called unqualified.
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Create new variable: the ratio of petal length to petal width, stored in
# the column petal_length_width_ratio.
iris <- iris |>
  mutate(petal_length_width_ratio = petal_length / petal_width)

View and Summarize the Data

Inspect the modified iris dataset.

# View() opens an interactive data viewer, so only call it in an interactive
# session; when the script is run non-interactively there is no viewer.
if (interactive()) {
  View(iris)
}

# Per-column summary statistics.
summary(iris)

##   sepal_length    sepal_width    petal_length    petal_width          species  
##  Min.   :4.300   Min.   :2.00   Min.   :1.000   Min.   :0.100   setosa    :50  
##  1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.600   1st Qu.:0.300   versicolor:50  
##  Median :5.800   Median :3.00   Median :4.300   Median :1.300   virginica :49  
##  Mean   :5.844   Mean   :3.06   Mean   :3.749   Mean   :1.195                  
##  3rd Qu.:6.400   3rd Qu.:3.30   3rd Qu.:5.100   3rd Qu.:1.800                  
##  Max.   :7.900   Max.   :4.40   Max.   :6.900   Max.   :2.500                  
##  petal_length_width_ratio
##  Min.   : 2.125          
##  1st Qu.: 2.810          
##  Median : 3.300          
##  Mean   : 4.321          
##  3rd Qu.: 4.667          
##  Max.   :15.000

# Structure of the data frame after cleaning and adding the ratio column.
str(iris)

## 'data.frame':    149 obs. of  6 variables:
##  $ sepal_length            : num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ sepal_width             : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ petal_length            : num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ petal_width             : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ species                 : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ petal_length_width_ratio: num  7 7 6.5 7.5 7 ...

# View() opens an interactive data viewer, so only call it in an interactive
# session; when the script is run non-interactively there is no viewer.
if (interactive()) {
  View(iris)
}

Save the Cleaned Dataset

Save the cleaned iris dataset to a CSV file, then read it back in as iris_data.

# Write the cleaned data to a CSV file (without row names) and read it back.
write.csv(iris, "cleaned_iris_data.csv", row.names = FALSE)
iris_data <- read.csv("cleaned_iris_data.csv")

# Confirm the round trip: the reloaded data must have the same dimensions and
# column names as the data frame that was written. Stops with an error if not.
stopifnot(
  identical(dim(iris_data), dim(iris)),
  identical(names(iris_data), names(iris))
)

Install and Load Visualization Packages

Install and load the ggplot2 and plotly packages.

# Install ggplot2 only when it is not already available, so that re-running
# the script does not reinstall it (and need network access) every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(ggplot2)

# Same guard for plotly.
if (!requireNamespace("plotly", quietly = TRUE)) {
  install.packages("plotly")
}

## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.4'
## (as 'lib' is unspecified)

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

Box Plot Using Plotly

Create a box plot using plotly.

# Box plot of petal length, one box per species, built from the cleaned iris
# data. (The previous call plotted ggplot2's unrelated `midwest` dataset.)
fig <- plot_ly(iris, x = ~petal_length, color = ~species, type = "box")

fig

Scatter Plot of Sepal Dimensions

Create a scatter plot of Sepal Length vs. Sepal Width, colored by species.

# Scatter plot of sepal length against sepal width, one colour per species.
ggplot(
  data = iris,
  mapping = aes(x = sepal_length, y = sepal_width, colour = species)
) +
  geom_point() +
  labs(
    x = "Sepal Length",
    y = "Sepal Width",
    title = "Scatter Plot of Sepal Length vs. Sepal Width"
  )

Histogram of Petal Length

Create a histogram of Petal Length.

# Histogram of petal length in bins of width 0.5, blue bars with black
# outlines.
ggplot(data = iris, mapping = aes(x = petal_length)) +
  geom_histogram(binwidth = 0.5, fill = "blue", colour = "black") +
  labs(
    x = "Petal Length",
    y = "Count",
    title = "Histogram of Petal Length"
  )

Density Plot of Petal Length by Species

Create a density plot of Petal Length by Species.

# Semi-transparent density curves of petal length, one fill colour per
# species.
ggplot(data = iris, mapping = aes(x = petal_length, fill = species)) +
  geom_density(alpha = 0.5) +
  labs(
    x = "Petal Length",
    y = "Density",
    title = "Density Plot of Petal Length by Species"
  )

Pairwise Plot Matrix

Create a pairwise plot matrix using GGally’s ggpairs function.

# Install GGally only when it is missing: this avoids reinstalling on every
# run, while still letting the script work where the package is not installed.
if (!requireNamespace("GGally", quietly = TRUE)) {
  install.packages("GGally")
}
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Pairwise plot matrix of all columns, coloured by species.
ggpairs(data = iris, mapping = aes(colour = species))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Iris_data_cleaning_and_visualization

zahra_rezakhah

2024-08-09

Convert column names to lowercase

Replace periods in column names with underscores

Load dplyr package