# Load necessary libraries
library(MASS)
library(ggplot2)
library(cluster)
library(gridExtra)
library(psych)
You can also embed plots, for example:
# Set seed for reproducibility
set.seed(123)
# Generate synthetic data
n <- 100
soil_quality <- rnorm(n, mean=5, sd=2)
rain <- rnorm(n, mean=100, sd=20)
pesticide <- rnorm(n, mean=3, sd=1)
fertilizer <- rnorm(n, mean=50, sd=10)
labor <- rnorm(n, mean=20, sd=5)
crop_yield <- 10 + 2 * soil_quality + 0.3 * rain + 1.5 * pesticide + 0.5 * fertilizer + 0.8 * labor + rnorm(n, mean=0, sd=5)
data <- data.frame(crop_yield, soil_quality, rain, pesticide, fertilizer, labor)
Note that the echo = FALSE parameter was added to the
code chunk to prevent printing of the R code that generated the
plot.
# Kernel density estimation for conditional density
kde <- MASS::kde2d(data$soil_quality, data$crop_yield, n=50)
# Plot the conditional density
filled.contour(kde, color.palette=colorRampPalette(c("white", "blue", "red")),
xlab="Soil Quality", ylab="Crop Yield", main="Conditional Density of Crop Yield Given Soil Quality")
# Perform PCA on the dataset
data_pca <- prcomp(data, scale. = TRUE)
# Plot the PCA results
pca_df <- data.frame(data_pca$x)
ggplot(pca_df, aes(x = PC1, y = PC2)) +
geom_point() +
ggtitle("PCA of Crop Yield Data") +
xlab("Principal Component 1") +
ylab("Principal Component 2")
# Perform k-means clustering
set.seed(123)
kmeans_result <- kmeans(scale(data), centers = 3)
# Add cluster information to the PCA data frame
pca_df$cluster <- as.factor(kmeans_result$cluster)
# Plot the clustering results on the PCA
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point() +
ggtitle("K-means Clustering on PCA of Crop Yield Data") +
xlab("Principal Component 1") +
ylab("Principal Component 2")
# Silhouette Analysis
dist_mat <- dist(scale(data)) # Using the full data distance matrix
# Check if silhouette object is created correctly
sil <- silhouette(kmeans_result$cluster, dist_mat)
print(summary(sil)) # Print summary to check if silhouette object is valid
## Silhouette of 100 units in 3 clusters from silhouette.default(x = kmeans_result$cluster, dist = dist_mat) :
## Cluster sizes and average silhouette widths:
## 38 34 28
## 0.1741499 0.1765157 0.1411486
## Individual silhouette widths:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00723 0.07491 0.16593 0.16571 0.24573 0.38606
# Plot the silhouette analysis
plot(sil, main = "Silhouette Plot for K-means Clustering", col = 1:3, border = NA)
# Additional PCA and clustering plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
geom_point(size = 2) +
theme_minimal() +
labs(title = "Clustering Results on Principal Components",
x = "Principal Component 1",
y = "Principal Component 2",
color = "Cluster")
# Plotting Crop Yield vs. Fertilizer
# Base R
plot(data$fertilizer, data$crop_yield,
main = "Crop Yield vs. Fertilizer",
xlab = "Fertilizer", ylab = "Crop Yield",
bg = "black", pch = 16)
abline(lm(crop_yield ~ fertilizer, data = data), col = "red")
# ggplot2
ggplot(data, aes(fertilizer, crop_yield)) +
geom_point() +
geom_smooth(method = "lm") +
labs(title = "Crop Yield vs. Fertilizer") +
theme(plot.title = element_text(hjust = 0.5)) +
theme(axis.text.x = element_text(face = "bold", size = 12)) +
theme(axis.text.y = element_text(face = "bold", size = 12)) +
theme(legend.position = "top", legend.title = element_blank())
## `geom_smooth()` using formula 'y ~ x'