Load the Dataset

# Make the built-in iris data frame available in the workspace
data("iris")

Operation 1: Find maximum and minimum values of each feature for each species

# Summarise every numeric feature per species with the supplied function
summarise_by_species <- function(summary_fun) {
  aggregate(. ~ Species, data = iris, FUN = summary_fun)
}

max_values <- summarise_by_species(max)
min_values <- summarise_by_species(min)

# Report the per-species maxima
cat("Maximum Values:\n")

## Maximum Values:

print(max_values)

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          5.8         4.4          1.9         0.6
## 2 versicolor          7.0         3.4          5.1         1.8
## 3  virginica          7.9         3.8          6.9         2.5

# Report the per-species minima
cat("Minimum Values:\n")

## Minimum Values:

print(min_values)

##      Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1     setosa          4.3         2.3          1.0         0.1
## 2 versicolor          4.9         2.0          3.0         1.0
## 3  virginica          4.9         2.2          4.5         1.4

Operation 2: Calculate standard deviation of Sepal Length for each species

# Per-species standard deviation of sepal length
std_dev_sep_length <- aggregate(
  Sepal.Length ~ Species,
  data = iris,
  FUN = sd
)
print("Standard Deviation of Sepal Length:")

## [1] "Standard Deviation of Sepal Length:"

print(std_dev_sep_length)

##      Species Sepal.Length
## 1     setosa    0.3524897
## 2 versicolor    0.5161711
## 3  virginica    0.6358796

Operation 3: Select and plot a subset of the data (Petal.Length for the setosa species)

# Choose which species and which measurement to display
species_subset <- "setosa"
feature_subset <- "Petal.Length"

# Keep only the rows belonging to the chosen species
subset_data <- subset(iris, Species == species_subset)

# Plot the chosen feature against observation index. The explicit ylab
# replaces the default label, which would otherwise be the deparsed R
# expression "subset_data[, feature_subset]"; xlab keeps the default text.
# `[[` extracts the column by its name held in a variable.
plot(
  subset_data[[feature_subset]],
  main = paste("Subset of", species_subset, "for", feature_subset),
  xlab = "Index",
  ylab = feature_subset
)

Operation 4: Check for missing values

# Flag every missing cell, then count the flags across the whole data frame
na_mask <- is.na(iris)
missing_values <- sum(na_mask)
print("Missing Values:")

## [1] "Missing Values:"

print(missing_values)

## [1] 0

No missing values found.

Operation 5: Checking for erroneous values

# Return the entries of a numeric vector that lie below `lower` or above
# `upper`
find_out_of_range <- function(values, lower = 0, upper = 10) {
  values[values < lower | values > upper]
}

# Screen each measurement column for values below 0 or above 10
erroneous_sepal_length <- find_out_of_range(iris$Sepal.Length)
erroneous_sepal_width <- find_out_of_range(iris$Sepal.Width)
erroneous_petal_length <- find_out_of_range(iris$Petal.Length)
erroneous_petal_width <- find_out_of_range(iris$Petal.Width)

# Pool the flagged values from all four features into one vector
all_erroneous_values <- c(
  erroneous_sepal_length,
  erroneous_sepal_width,
  erroneous_petal_length,
  erroneous_petal_width
)

# Report the outcome: either a clean bill of health or the offending values
if (length(all_erroneous_values) == 0) {
  cat("No Erroneous Values Found.\n")
} else {
  cat("Erroneous Values Found:\n")
  print(all_erroneous_values)
}

## No Erroneous Values Found.

Operation 6: Create new feature representing ratio of Petal Length to Sepal Length

# Start from a copy of iris and add the derived column to the copy
iris_with_ratio <- iris
# Petal length divided by sepal length, computed row by row
iris_with_ratio[["Petal_Sepal_Ratio"]] <- with(
  iris_with_ratio,
  Petal.Length / Sepal.Length
)

iris_with_ratio[["Petal_Sepal_Ratio"]]

##   [1] 0.2745098 0.2857143 0.2765957 0.3260870 0.2800000 0.3148148 0.3043478
##   [8] 0.3000000 0.3181818 0.3061224 0.2777778 0.3333333 0.2916667 0.2558140
##  [15] 0.2068966 0.2631579 0.2407407 0.2745098 0.2982456 0.2941176 0.3148148
##  [22] 0.2941176 0.2173913 0.3333333 0.3958333 0.3200000 0.3200000 0.2884615
##  [29] 0.2692308 0.3404255 0.3333333 0.2777778 0.2884615 0.2545455 0.3061224
##  [36] 0.2400000 0.2363636 0.2857143 0.2954545 0.2941176 0.2600000 0.2888889
##  [43] 0.2954545 0.3200000 0.3725490 0.2916667 0.3137255 0.3043478 0.2830189
##  [50] 0.2800000 0.6714286 0.7031250 0.7101449 0.7272727 0.7076923 0.7894737
##  [57] 0.7460317 0.6734694 0.6969697 0.7500000 0.7000000 0.7118644 0.6666667
##  [64] 0.7704918 0.6428571 0.6567164 0.8035714 0.7068966 0.7258065 0.6964286
##  [71] 0.8135593 0.6557377 0.7777778 0.7704918 0.6718750 0.6666667 0.7058824
##  [78] 0.7462687 0.7500000 0.6140351 0.6909091 0.6727273 0.6724138 0.8500000
##  [85] 0.8333333 0.7500000 0.7014925 0.6984127 0.7321429 0.7272727 0.8000000
##  [92] 0.7540984 0.6896552 0.6600000 0.7500000 0.7368421 0.7368421 0.6935484
##  [99] 0.5882353 0.7192982 0.9523810 0.8793103 0.8309859 0.8888889 0.8923077
## [106] 0.8684211 0.9183673 0.8630137 0.8656716 0.8472222 0.7846154 0.8281250
## [113] 0.8088235 0.8771930 0.8793103 0.8281250 0.8461538 0.8701299 0.8961039
## [120] 0.8333333 0.8260870 0.8750000 0.8701299 0.7777778 0.8507463 0.8333333
## [127] 0.7741935 0.8032787 0.8750000 0.8055556 0.8243243 0.8101266 0.8750000
## [134] 0.8095238 0.9180328 0.7922078 0.8888889 0.8593750 0.8000000 0.7826087
## [141] 0.8358209 0.7391304 0.8793103 0.8676471 0.8507463 0.7761194 0.7936508
## [148] 0.8000000 0.8709677 0.8644068

Operation 7: Plot boxplot of Petal_Sepal_Ratio for each species

library(ggplot2)

# One box per species showing the spread of the petal-to-sepal length ratio
ggplot(
  data = iris_with_ratio,
  mapping = aes(x = Species, y = Petal_Sepal_Ratio)
) +
  geom_boxplot(fill = "lightblue") +
  labs(
    title = "Distribution of Petal Sepal Ratio by Species",
    x = "Species",
    y = "Petal Sepal Ratio"
  )

Operation 8: Identify and remove outliers using z-score threshold

# Standardise the four numeric features (scale() centres and scales each
# column by default)
z_scores <- scale(iris[, 1:4])

# A row is an outlier when any of its features has an absolute z-score
# greater than 3
exceeds_threshold <- abs(z_scores) > 3
outliers <- rowSums(exceeds_threshold) > 0

# Report how many rows were flagged
num_outliers <- sum(outliers)
cat("Number of Outliers Found:", num_outliers, "\n")

## Number of Outliers Found: 1

# Keep only the rows that were not flagged, then preview them
cleaned_data <- iris[!outliers, ]
cat("Cleaned Data (without outliers):\n")

## Cleaned Data (without outliers):

print(head(cleaned_data))

##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1          5.1         3.5          1.4         0.2  setosa
## 2          4.9         3.0          1.4         0.2  setosa
## 3          4.7         3.2          1.3         0.2  setosa
## 4          4.6         3.1          1.5         0.2  setosa
## 5          5.0         3.6          1.4         0.2  setosa
## 6          5.4         3.9          1.7         0.4  setosa

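One outlier was found (a row in which at least one feature has an absolute z-score greater than 3), and it has been removed from the cleaned data.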

Operation 9: Perform data normalization

# Standardise the four numeric columns, then convert the resulting matrix
# back into a data frame
scaled_matrix <- scale(iris[, 1:4])
normalized_data <- as.data.frame(scaled_matrix)
# Show the first few standardised rows
print("Normalized Data (First Few Rows):")

## [1] "Normalized Data (First Few Rows):"

print(head(normalized_data))

##   Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1   -0.8976739  1.01560199    -1.335752   -1.311052
## 2   -1.1392005 -0.13153881    -1.335752   -1.311052
## 3   -1.3807271  0.32731751    -1.392399   -1.311052
## 4   -1.5014904  0.09788935    -1.279104   -1.311052
## 5   -1.0184372  1.24503015    -1.335752   -1.311052
## 6   -0.5353840  1.93331463    -1.165809   -1.048667

Operation 10: Aggregate dataset to calculate total count of each species

# Count rows per species by taking the length of Sepal.Length in each group.
# The count column keeps the name "Sepal.Length" because that is the
# left-hand side of the formula.
species_count <- aggregate(
  Sepal.Length ~ Species,
  data = iris,
  FUN = length
)

# Show the number of observations recorded for each species
print("Total Count of Each Species:")

## [1] "Total Count of Each Species:"

print(species_count)

##      Species Sepal.Length
## 1     setosa           50
## 2 versicolor           50
## 3  virginica           50

IDS Mini Project

2024-02-27

Load the Dataset

Operation 1: Find maximum and minimum values of each feature for each species

Operation 2: Calculate standard deviation of Sepal Length for each species

Operation 3: Select and plot a subset of the data (Petal.Length for the setosa species)

Operation 4: Check for missing values

Operation 5: Checking for erroneous values

Operation 6: Create new feature representing ratio of Petal Length to Sepal Length

Operation 7: Plot boxplot of Petal_Sepal_Ratio for each species

Operation 8: Identify and remove outliers using z-score threshold

Operation 9: Perform data normalization

Operation 10: Aggregate dataset to calculate total count of each species