library(tidyverse)
library(janitor)
library(psych)
library(rsample)
# Read the raw CSV and let janitor standardise the column names.
iris_data <- clean_names(read_csv("iris.csv"))

# Structural sanity checks: dimensions, column names, then the first 15 rows.
dim(iris_data)
## [1] 150 5
names(iris_data)
## [1] "x1" "sepal_length_cm" "sepal_width_cm" "petal_length_cm"
## [5] "petal_width_cm"
head(iris_data, n = 15)
#' Min-max scale a numeric vector to the interval [0, 1].
#'
#' Missing values are ignored when locating the minimum and maximum and are
#' returned as `NA` in the result.
#'
#' @param x A numeric vector.
#' @return A vector the same length as `x`. When every observed value is
#'   identical the observed entries are returned as 0 (instead of the NaN
#'   that 0 / 0 would give); when there are no observed values at all the
#'   result is all `NA`, without the min()/max() warnings.
normalize <- function(x) {
  observed <- x[!is.na(x)]

  # Nothing to scale against: avoid min()/max() warnings on empty input.
  if (length(observed) == 0L) {
    return(rep(NA_real_, length(x)))
  }

  lo <- min(observed)
  span <- max(observed) - lo

  # Constant input would divide 0 by 0; map it to 0 instead.
  # isTRUE() keeps a NaN span (e.g. Inf - Inf) from breaking the `if`.
  if (isTRUE(span == 0)) {
    return(x - lo)
  }

  (x - lo) / span
}
# Append a min-max scaled copy of sepal length, scaled over all loaded rows.
iris_data <- mutate(iris_data, sepal_length_norm = normalize(sepal_length_cm))

# Keep only the rows whose petal length is greater than 4, then preview them.
iris_filtered <- filter(iris_data, petal_length_cm > 4)
head(iris_filtered, n = 10)
# Discard every row that has a missing value, then confirm none are left.
iris_no_na <- drop_na(iris_data)
sum(!complete.cases(iris_no_na))
## [1] 0

# Discard exact duplicate rows and report how many were removed.
# NOTE(review): distinct() compares all columns, including x1, which looks
# like a row index; if x1 is unique per row no duplicate can ever be found.
# Confirm whether x1 should be excluded from the comparison.
iris_dedup <- distinct(iris_no_na)
nrow(iris_no_na) - nrow(iris_dedup)
## [1] 0

# Order by petal width, then sepal length, both descending; show the top 10.
iris_ordered <- arrange(
  iris_dedup,
  desc(petal_width_cm),
  desc(sepal_length_cm)
)
head(iris_ordered, n = 10)
# Give the four measurement columns Title_Case names without the unit suffix.
iris_renamed <- rename(
  iris_dedup,
  Sepal_Length = sepal_length_cm,
  Sepal_Width = sepal_width_cm,
  Petal_Length = petal_length_cm,
  Petal_Width = petal_width_cm
)
names(iris_renamed)
## [1] "x1" "Sepal_Length" "Sepal_Width"
## [4] "Petal_Length" "Petal_Width" "sepal_length_norm"

# Derive two extra columns: twice the petal length, and the product of sepal
# length and width as a rough area figure.
iris_augmented <- mutate(
  iris_renamed,
  double_petal_length = Petal_Length * 2,
  sepal_area_approx = Sepal_Length * Sepal_Width
)
head(iris_augmented, n = 10)
# Fix the RNG seed so the random train/test partition below is the same on
# every run; without it the rows in each set change from run to run.
set.seed(123)

# Partition the augmented data with prop = 0.7 (training share), then pull
# out the two resulting data frames.
split_obj <- initial_split(iris_augmented, prop = 0.7)
train_df <- training(split_obj)
test_df <- testing(split_obj)

# Row counts of each partition.
nrow(train_df)
## [1] 102
nrow(test_df)
## [1] 45
# Per-column summary statistics for the augmented data.
# NOTE(review): the recorded output below shows a Petal_Width maximum of 14.0
# against a 3rd quartile of 1.8 - possibly a data-entry outlier; verify.
summary(iris_augmented)
## x1 Sepal_Length Sepal_Width Petal_Length
## Min. : 0.00 Min. :4.300 Min. :2.00 Min. :1.00
## 1st Qu.: 38.50 1st Qu.:5.100 1st Qu.:2.80 1st Qu.:1.60
## Median : 76.00 Median :5.800 Median :3.00 Median :4.40
## Mean : 75.22 Mean :5.854 Mean :3.05 Mean :3.78
## 3rd Qu.:112.50 3rd Qu.:6.400 3rd Qu.:3.30 3rd Qu.:5.10
## Max. :149.00 Max. :7.900 Max. :4.40 Max. :6.90
## Petal_Width sepal_length_norm double_petal_length sepal_area_approx
## Min. : 0.100 Min. :0.0000 Min. : 2.000 Min. :10.00
## 1st Qu.: 0.300 1st Qu.:0.2222 1st Qu.: 3.200 1st Qu.:15.65
## Median : 1.300 Median :0.4167 Median : 8.800 Median :17.68
## Mean : 1.297 Mean :0.4316 Mean : 7.561 Mean :17.81
## 3rd Qu.: 1.800 3rd Qu.:0.5833 3rd Qu.:10.200 3rd Qu.:20.25
## Max. :14.000 Max. :1.0000 Max. :13.800 Max. :30.02

# Pull sepal length out as a plain vector; it is reused by later statements.
x <- iris_augmented[["Sepal_Length"]]

# Central tendency: arithmetic mean, then median.
mean(x)
## [1] 5.853741
median(x)
## [1] 5.8
#' Most frequent value of a vector (statistical mode).
#'
#' Missing values are dropped before counting. When several values share the
#' highest count, the one that appears first in `v` is returned.
#'
#' @param v A vector.
#' @return A length-one vector holding the modal value.
mode_stat <- function(v) {
  present <- v[!is.na(v)]
  candidates <- unique(present)
  # Count occurrences of each candidate, in order of first appearance.
  counts <- tabulate(match(present, candidates))
  candidates[which.max(counts)]
}
# Most frequent sepal length, followed by the smallest and largest values.
mode_stat(x)
## [1] 5
c(min(x), max(x))
## [1] 4.3 7.9
# Scatter plot of petal length against sepal length.
ggplot(
  data = iris_augmented,
  mapping = aes(x = Sepal_Length, y = Petal_Length)
) +
  geom_point(color = "steelblue", alpha = 0.8) +
  labs(title = "Scatter: Sepal Length vs Petal Length") +
  theme_minimal()

# Bar chart of how many rows fall on each rounded sepal length; count()
# builds the rounded column and tallies it in one step.
iris_augmented %>%
  count(sepal_length_round = round(Sepal_Length)) %>%
  ggplot(mapping = aes(x = sepal_length_round, y = n)) +
  geom_col(fill = "darkorange") +
  labs(
    title = "Bar: Frequency of Rounded Sepal Lengths",
    x = "Sepal Length (rounded)",
    y = "Count"
  ) +
  theme_minimal()
# Pearson correlation between sepal length and petal length.
with(iris_augmented, cor(Sepal_Length, Petal_Length, method = "pearson"))
## [1] 0.8692145
# Dataset source: Kaggle — Iris Dataset (iris.csv)
# GitHub repository: https://github.com/bharatchopra-tech/irisdata-analysisGroup1