Iris Data Analysis Project

Load Packages

library(tidyverse)
library(janitor)
library(psych)
library(rsample)

Import Data

# Read the raw CSV from the working directory, then let janitor convert the
# header to snake_case column names.
iris_data <- clean_names(read_csv("iris.csv"))

dim(iris_data)

## [1] 150   5

names(iris_data)

## [1] "x1"              "sepal_length_cm" "sepal_width_cm"  "petal_length_cm"
## [5] "petal_width_cm"

head(iris_data, 15)

User-defined Function

# Min-max rescale a numeric vector onto [0, 1].
#
# x: numeric vector. NA entries are ignored when finding the range and
#    remain NA in the result.
# Returns a numeric vector the same length as x. A vector whose non-missing
# values are all equal is mapped to 0 rather than NaN.
normalize <- function(x) {
  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]
  # max - min of identical values is exactly 0, so an exact comparison is
  # the right test here; dividing by it would turn every element into NaN.
  if (is.finite(span) && span == 0) {
    return(x - rng[1])
  }
  (x - rng[1]) / span
}

# Append a rescaled copy of sepal length; the original column is kept.
iris_data <- mutate(
  iris_data,
  sepal_length_norm = normalize(sepal_length_cm)
)

Filtering Rows

# Keep only rows whose petal length is strictly greater than 4
# (rows where the comparison is NA are dropped by filter()).
iris_filtered <- filter(iris_data, petal_length_cm > 4)

head(iris_filtered, n = 10)

Remove Missing Values

# Drop every row that has a missing value in any column.
iris_no_na <- drop_na(iris_data)
# Sanity check: number of incomplete rows remaining after the drop.
sum(!complete.cases(iris_no_na))

## [1] 0

Remove Duplicates

# distinct() with no column arguments compares entire rows, so every column
# (including x1) takes part in the comparison.
# NOTE(review): x1 looks like a row index (0-149 in the summary output); if
# so, no two rows can ever match here. Confirm that is intended.
iris_dedup <- distinct(iris_no_na)
# Number of rows removed as duplicates.
nrow(iris_no_na) - nrow(iris_dedup)

## [1] 0

Reorder Rows

# Sort by petal width, largest first; ties are broken by sepal length,
# also largest first.
iris_ordered <- arrange(
  iris_dedup,
  desc(petal_width_cm),
  desc(sepal_length_cm)
)
head(iris_ordered, n = 10)

Rename Columns

# Give the four measurement columns shorter, capitalised names
# (new_name = old_name); all other columns keep their names.
iris_renamed <- rename(
  iris_dedup,
  Sepal_Length = sepal_length_cm,
  Sepal_Width = sepal_width_cm,
  Petal_Length = petal_length_cm,
  Petal_Width = petal_width_cm
)

names(iris_renamed)

## [1] "x1"                "Sepal_Length"      "Sepal_Width"      
## [4] "Petal_Length"      "Petal_Width"       "sepal_length_norm"

Add New Variables

# Derive two extra columns: petal length doubled, and the product of sepal
# length and sepal width as a rough area figure.
iris_augmented <- mutate(
  iris_renamed,
  double_petal_length = 2 * Petal_Length,
  sepal_area_approx = Sepal_Length * Sepal_Width
)

head(iris_augmented, n = 10)

Training and Testing Split

# initial_split() samples rows at random. Fix the RNG seed first so the
# same rows land in the training and testing sets on every run/knit.
set.seed(123)

# 70% of the rows go to training, the remainder to testing.
split_obj <- initial_split(iris_augmented, prop = 0.7)
train_df <- training(split_obj)
test_df <- testing(split_obj)

nrow(train_df); nrow(test_df)

## [1] 102

## [1] 45

Summary Statistics

summary(iris_augmented)

##        x1          Sepal_Length    Sepal_Width    Petal_Length 
##  Min.   :  0.00   Min.   :4.300   Min.   :2.00   Min.   :1.00  
##  1st Qu.: 38.50   1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.60  
##  Median : 76.00   Median :5.800   Median :3.00   Median :4.40  
##  Mean   : 75.22   Mean   :5.854   Mean   :3.05   Mean   :3.78  
##  3rd Qu.:112.50   3rd Qu.:6.400   3rd Qu.:3.30   3rd Qu.:5.10  
##  Max.   :149.00   Max.   :7.900   Max.   :4.40   Max.   :6.90  
##   Petal_Width     sepal_length_norm double_petal_length sepal_area_approx
##  Min.   : 0.100   Min.   :0.0000    Min.   : 2.000      Min.   :10.00    
##  1st Qu.: 0.300   1st Qu.:0.2222    1st Qu.: 3.200      1st Qu.:15.65    
##  Median : 1.300   Median :0.4167    Median : 8.800      Median :17.68    
##  Mean   : 1.297   Mean   :0.4316    Mean   : 7.561      Mean   :17.81    
##  3rd Qu.: 1.800   3rd Qu.:0.5833    3rd Qu.:10.200      3rd Qu.:20.25    
##  Max.   :14.000   Max.   :1.0000    Max.   :13.800      Max.   :30.02

Mean, Median, Mode, Range

x <- iris_augmented$Sepal_Length

mean(x)

## [1] 5.853741

median(x)

## [1] 5.8

# Most frequent non-missing value of a vector.
#
# v: an atomic vector; NA entries are discarded before counting.
# Returns a length-one vector of the same type as v. When several values
# share the highest count, the one that appears first in v is returned.
mode_stat <- function(v) {
  present <- v[!is.na(v)]
  candidates <- unique(present)
  # match() maps each element to the position of its value in `candidates`;
  # tabulate() then counts how often each position occurs.
  freq <- tabulate(match(present, candidates))
  winner <- which.max(freq)
  candidates[winner]
}
mode_stat(x)

## [1] 5

range(x)

## [1] 4.3 7.9

Scatter Plot

# Scatter plot of petal length against sepal length, one point per row.
iris_augmented %>%
  ggplot(aes(x = Sepal_Length, y = Petal_Length)) +
  geom_point(color = "steelblue", alpha = 0.8) +
  labs(title = "Scatter: Sepal Length vs Petal Length") +
  theme_minimal()

Bar Plot (Example: Frequency of Rounded Sepal Lengths)

# Round sepal length to the nearest whole number, count rows per rounded
# value, and draw one bar per value.
ggplot(
  count(iris_augmented, sepal_length_round = round(Sepal_Length)),
  aes(x = sepal_length_round, y = n)
) +
  geom_col(fill = "darkorange") +
  labs(
    title = "Bar: Frequency of Rounded Sepal Lengths",
    x = "Sepal Length (rounded)",
    y = "Count"
  ) +
  theme_minimal()

Pearson Correlation

# Pearson correlation coefficient between sepal length and petal length.
with(
  iris_augmented,
  cor(Sepal_Length, Petal_Length, method = "pearson")
)

## [1] 0.8692145

References

Dataset source: Kaggle — Iris Dataset (iris.csv)
GitHub repository: https://github.com/bharatchopra-tech/irisdata-analysisGroup1

Iris Data Analysis Project

Group 1

2025-11-29

Load Packages

Import Data

User-defined Function

Filtering Rows

Remove Missing Values

Remove Duplicates

Reorder Rows

Rename Columns

Add New Variables

Training and Testing Split

Summary Statistics

Mean, Median, Mode, Range

Scatter Plot

Bar Plot (Example: Frequency of Rounded Sepal Lengths)

Pearson Correlation

References