Load Packages

library(tidyverse)
library(janitor)
library(psych)
library(rsample)

Import Data

# Read the raw CSV from the working directory and hand it straight to
# janitor's name cleaner (resulting column names are listed below).
iris_data <- clean_names(read_csv("iris.csv"))

# Structural checks: dimensions, column names, and the first 15 rows.
c(nrow(iris_data), ncol(iris_data))
## [1] 150   5
colnames(iris_data)
## [1] "x1"              "sepal_length_cm" "sepal_width_cm"  "petal_length_cm"
## [5] "petal_width_cm"
head(iris_data, n = 15)

User-defined Function (Min-Max Normalization)

# Rescale a numeric vector to the [0, 1] interval (min-max normalisation).
#
# Args:
#   x: Numeric vector. NA entries are ignored when the range is computed and
#      remain NA in the result.
#
# Returns:
#   A numeric vector the same length as x. The smallest non-missing value maps
#   to 0 and the largest to 1. If every non-missing value is identical, those
#   entries map to 0 (instead of NaN from 0 / 0). Empty or all-NA input
#   yields an all-NA result of the same length without min()/max() warnings.
normalize <- function(x) {
  # Nothing to scale: avoid the -Inf/Inf warnings min()/max() emit here.
  if (length(x) == 0L || all(is.na(x))) {
    return(rep(NA_real_, length(x)))
  }

  rng <- range(x, na.rm = TRUE)
  span <- rng[2] - rng[1]

  # Constant input has zero span; dividing by it would produce NaN.
  # isTRUE() keeps a non-finite span (e.g. Inf - Inf) on the normal path.
  if (isTRUE(span == 0)) {
    # Multiplying by a double zero keeps NA positions and returns doubles.
    return(0 * (x - rng[1]))
  }

  (x - rng[1]) / span
}

# Append a min-max scaled copy of sepal length, computed with normalize().
iris_data <- mutate(
  iris_data,
  sepal_length_norm = normalize(sepal_length_cm)
)

Filtering Rows

# Keep only the observations whose petal length exceeds 4, then preview them.
iris_filtered <- filter(iris_data, petal_length_cm > 4)

head(iris_filtered, n = 10)

Remove Missing Values

# Discard every row that has an NA in any column.
iris_no_na <- drop_na(iris_data)
# Sanity check: number of rows that are still incomplete (expected 0).
nrow(iris_no_na) - sum(complete.cases(iris_no_na))
## [1] 0

Remove Duplicates

# Collapse rows that are identical across all columns.
# NOTE(review): x1 takes part in the comparison and looks like a per-row
# index (it spans 0-149 in the summary output). If so, no two rows can match
# and this step never removes anything -- confirm that is intended.
iris_dedup <- distinct(iris_no_na)
# Number of rows removed by the de-duplication.
nrow(iris_no_na) - nrow(iris_dedup)
## [1] 0

Reorder Rows

# Sort by petal width, widest first; ties are broken by sepal length,
# longest first.
iris_ordered <- arrange(
  iris_dedup,
  desc(petal_width_cm),
  desc(sepal_length_cm)
)
head(iris_ordered, n = 10)

Rename Columns

# Give the four measurement columns Title_Case names without the unit suffix;
# all other columns keep their names.
iris_renamed <- rename(
  iris_dedup,
  Sepal_Length = sepal_length_cm,
  Sepal_Width = sepal_width_cm,
  Petal_Length = petal_length_cm,
  Petal_Width = petal_width_cm
)

colnames(iris_renamed)
## [1] "x1"                "Sepal_Length"      "Sepal_Width"      
## [4] "Petal_Length"      "Petal_Width"       "sepal_length_norm"

Add New Variables

# Derive two extra columns: twice the petal length, and the product of sepal
# length and width as a rough area.
iris_augmented <- mutate(
  iris_renamed,
  double_petal_length = 2 * Petal_Length,
  sepal_area_approx = Sepal_Width * Sepal_Length
)

head(iris_augmented, n = 10)

Training and Testing Split

# Fix the RNG state first: initial_split() samples rows at random, so without
# a seed the train/test membership changes on every run.
set.seed(123)

# Randomly assign 70% of the rows to training and the remainder to testing.
split_obj <- initial_split(iris_augmented, prop = 0.7)
train_df <- training(split_obj)
test_df <- testing(split_obj)

# Row counts of the two partitions.
nrow(train_df)
nrow(test_df)
## [1] 102
## [1] 45

Summary Statistics

# Per-column summary (min, quartiles, mean, max) of the augmented data.
iris_augmented %>% summary()
##        x1          Sepal_Length    Sepal_Width    Petal_Length 
##  Min.   :  0.00   Min.   :4.300   Min.   :2.00   Min.   :1.00  
##  1st Qu.: 38.50   1st Qu.:5.100   1st Qu.:2.80   1st Qu.:1.60  
##  Median : 76.00   Median :5.800   Median :3.00   Median :4.40  
##  Mean   : 75.22   Mean   :5.854   Mean   :3.05   Mean   :3.78  
##  3rd Qu.:112.50   3rd Qu.:6.400   3rd Qu.:3.30   3rd Qu.:5.10  
##  Max.   :149.00   Max.   :7.900   Max.   :4.40   Max.   :6.90  
##   Petal_Width     sepal_length_norm double_petal_length sepal_area_approx
##  Min.   : 0.100   Min.   :0.0000    Min.   : 2.000      Min.   :10.00    
##  1st Qu.: 0.300   1st Qu.:0.2222    1st Qu.: 3.200      1st Qu.:15.65    
##  Median : 1.300   Median :0.4167    Median : 8.800      Median :17.68    
##  Mean   : 1.297   Mean   :0.4316    Mean   : 7.561      Mean   :17.81    
##  3rd Qu.: 1.800   3rd Qu.:0.5833    3rd Qu.:10.200      3rd Qu.:20.25    
##  Max.   :14.000   Max.   :1.0000    Max.   :13.800      Max.   :30.02

Mean, Median, Mode, and Range of Sepal Length

# Pull the sepal length column out as a plain vector for the statistics below.
x <- iris_augmented[["Sepal_Length"]]

# Arithmetic mean.
mean(x)
## [1] 5.853741
# Middle value.
median(x)
## [1] 5.8
# Most frequent value of a vector, ignoring NAs.
#
# Args:
#   v: Vector of values.
#
# Returns:
#   The value that occurs most often among the non-missing entries. When
#   several values tie, the one that appears first in v is returned.
mode_stat <- function(v) {
  observed <- v[!is.na(v)]
  candidates <- unique(observed)
  # Map each observation to its position in `candidates`, then count how many
  # observations land on each position.
  counts <- tabulate(match(observed, candidates))
  # which.max() picks the first maximum, which gives the tie-breaking rule.
  candidates[which.max(counts)]
}
# Most frequent sepal length.
mode_stat(x)
## [1] 5
# Smallest and largest sepal length.
c(min(x), max(x))
## [1] 4.3 7.9

Scatter Plot

# Scatter plot of petal length against sepal length, one point per row.
ggplot(iris_augmented, aes(Sepal_Length, Petal_Length)) +
  geom_point(color = "steelblue", alpha = 0.8) +
  ggtitle("Scatter: Sepal Length vs Petal Length") +
  theme_minimal()

Bar Plot (Example: Frequency of Rounded Sepal Lengths)

# Round sepal length to the nearest whole number, count the rows per rounded
# value, and draw one bar per value.
iris_augmented %>%
  count(sepal_length_round = round(Sepal_Length)) %>%
  ggplot(aes(sepal_length_round, n)) +
  geom_col(fill = "darkorange") +
  ggtitle("Bar: Frequency of Rounded Sepal Lengths") +
  xlab("Sepal Length (rounded)") +
  ylab("Count") +
  theme_minimal()

Pearson Correlation

# Pearson correlation coefficient between sepal length and petal length.
with(
  iris_augmented,
  cor(Sepal_Length, Petal_Length, method = "pearson")
)
## [1] 0.8692145

References

Dataset source: Kaggle — Iris Dataset (iris.csv)
GitHub repository: https://github.com/bharatchopra-tech/irisdata-analysisGroup1