Data arrangement and cleaning
Clean data: rename variables; find, correct, or remove missing values; and identify outliers
# Build the analysis table from the raw plant measurements:
#   1. give every column a descriptive name (with units where the name
#      carries them),
#   2. add the per-row mean of the two diameter measurements,
#   3. recode a missing stem count as 0.
cleaned_data <- plant_data %>%
  rename(
    ID                 = plant_ID,
    Height_cm          = height,
    Diameter_cm        = diameter,
    Diameter2_cm       = diameter2,
    Stem_Count         = stems,
    Resprout_Count     = resprouts,
    Resprout_Length_cm = resprout_length,
    Gall_index         = gall
  ) %>%
  mutate(
    # na.rm = TRUE: a row with only one diameter reading keeps that reading
    # as its average instead of becoming NA.
    Diameter_avg = rowMeans(cbind(Diameter_cm, Diameter2_cm), na.rm = TRUE)
  ) %>%
  replace_na(list(Stem_Count = 0))
# Return the rows of `data` whose value in `column` is an outlier by the
# IQR rule.
#
# A value is flagged when it lies strictly below Q1 - k * IQR or strictly
# above Q3 + k * IQR, where Q1 and Q3 are the 25th and 75th percentiles of the
# column, computed with missing values removed.
#
# Arguments:
#   data    A data frame or tibble.
#   column  Name of a numeric column in `data`, given as a single string.
#   k       Fence multiplier applied to the IQR; defaults to 1.5.
#
# Returns `data` restricted to the outlier rows (zero rows when there are
# none), with the same columns and class as the input. Rows whose `column`
# value is NA are never returned.
identify_outliers <- function(data, column, k = 1.5) {
  if (!is.character(column) || length(column) != 1L || is.na(column)) {
    stop("`column` must be a single column name.", call. = FALSE)
  }
  if (!column %in% names(data)) {
    stop("Column `", column, "` not found in `data`.", call. = FALSE)
  }

  values <- data[[column]]
  # All-NA columns are commonly stored as logical, so accept those as well.
  if (!is.numeric(values) && !is.logical(values)) {
    stop("Column `", column, "` must be numeric.", call. = FALSE)
  }

  quartiles <- quantile(values, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  fence <- k * (quartiles[2] - quartiles[1])
  lower_bound <- quartiles[1] - fence
  upper_bound <- quartiles[2] + fence

  # Compare plain vectors instead of filtering inside a data mask: there, a
  # column of `data` that happened to be called `lower_bound` or `upper_bound`
  # would shadow the fences computed above. which() discards NA comparisons,
  # so rows with a missing value are excluded.
  outlier_rows <- which(values < lower_bound | values > upper_bound)
  data[outlier_rows, , drop = FALSE]
}
# Rows whose height falls outside the 1.5 * IQR fences.
height_outliers <- cleaned_data %>%
  identify_outliers("Height_cm")
print(height_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## # Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## # Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows whose first diameter measurement falls outside the 1.5 * IQR fences.
D1_outliers <- cleaned_data %>%
  identify_outliers("Diameter_cm")
print(D1_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## # Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## # Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows whose second diameter measurement falls outside the 1.5 * IQR fences.
D2_outliers <- cleaned_data %>%
  identify_outliers("Diameter2_cm")
print(D2_outliers)
## # A tibble: 0 × 9
## # ℹ 9 variables: ID <dbl>, Height_cm <dbl>, Diameter_cm <dbl>,
## # Diameter2_cm <dbl>, Stem_Count <dbl>, Resprout_Count <chr>,
## # Resprout_Length_cm <chr>, Gall_index <dbl>, Diameter_avg <dbl>
# Rows whose stem count falls outside the 1.5 * IQR fences. Missing stem
# counts were recoded to 0 during cleaning, so those zeros are included when
# the quartiles are computed.
stem_outliers <- cleaned_data %>%
  identify_outliers("Stem_Count")
print(stem_outliers)
## # A tibble: 5 × 9
## ID Height_cm Diameter_cm Diameter2_cm Stem_Count Resprout_Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 14 54 58 38 18 >10
## 2 19 53 74 47 18 >10
## 3 27 50 42 32 23 >10
## 4 29 58 40 30 13 >10
## 5 37 57 91 81 14 >10
## # ℹ 3 more variables: Resprout_Length_cm <chr>, Gall_index <dbl>,
## # Diameter_avg <dbl>
# Rows whose gall index falls outside the 1.5 * IQR fences.
# NOTE(review): the printed result lists IDs 12 and 38 twice and contains an
# ID of 215 between 24 and 37 -- possibly duplicated rows and a mistyped ID in
# plant_data; TODO confirm against the raw data.
Gall_outliers <- cleaned_data %>%
  identify_outliers("Gall_index")
print(Gall_outliers)
## # A tibble: 10 × 9
## ID Height_cm Diameter_cm Diameter2_cm Stem_Count Resprout_Count
## <dbl> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 4 124 181 112 5 0
## 2 12 130 134 108 3 1
## 3 12 130 134 108 3 1
## 4 19 53 74 47 18 >10
## 5 24 71 129 70 2 0
## 6 215 63 117 55 3 0
## 7 37 57 91 81 14 >10
## 8 38 36 27 23 8 8
## 9 38 36 27 23 8 8
## 10 40 70 81 74 4 0
## # ℹ 3 more variables: Resprout_Length_cm <chr>, Gall_index <dbl>,
## # Diameter_avg <dbl>
Summary statistics on clean data
## # A tibble: 1 × 4
## Avg_Height Avg_Diameter Avg_Stem_Count Avg_Gall_Index
## <dbl> <dbl> <dbl> <dbl>
## 1 44.1 47.7 3.8 0.125
Data distributions (negatively skewed because the “zero” entries were not dropped)