advanced-statistics

Including Plots

You can also embed plots, for example:

# Set seed for reproducibility
set.seed(123)

# Generate synthetic data
n <- 100
soil_quality <- rnorm(n, mean=5, sd=2)
rain <- rnorm(n, mean=100, sd=20)
pesticide <- rnorm(n, mean=3, sd=1)
fertilizer <- rnorm(n, mean=50, sd=10)
labor <- rnorm(n, mean=20, sd=5)
crop_yield <- 10 + 2 * soil_quality + 0.3 * rain + 1.5 * pesticide + 0.5 * fertilizer + 0.8 * labor + rnorm(n, mean=0, sd=5)
data <- data.frame(crop_yield, soil_quality, rain, pesticide, fertilizer, labor)

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.

# Kernel density estimation for conditional density
kde <- MASS::kde2d(data$soil_quality, data$crop_yield, n=50)

# Plot the conditional density
filled.contour(kde, color.palette=colorRampPalette(c("white", "blue", "red")), 
               xlab="Soil Quality", ylab="Crop Yield", main="Conditional Density of Crop Yield Given Soil Quality")

# Perform PCA on the dataset
data_pca <- prcomp(data, scale. = TRUE)

# Plot the PCA results
pca_df <- data.frame(data_pca$x)
ggplot(pca_df, aes(x = PC1, y = PC2)) +
  geom_point() +
  ggtitle("PCA of Crop Yield Data") +
  xlab("Principal Component 1") +
  ylab("Principal Component 2")

# Perform k-means clustering
set.seed(123)
kmeans_result <- kmeans(scale(data), centers = 3)

# Add cluster information to the PCA data frame
pca_df$cluster <- as.factor(kmeans_result$cluster)

# Plot the clustering results on the PCA
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point() +
  ggtitle("K-means Clustering on PCA of Crop Yield Data") +
  xlab("Principal Component 1") +
  ylab("Principal Component 2")

# Silhouette Analysis
dist_mat <- dist(scale(data))  # Using the full data distance matrix

# Check if silhouette object is created correctly
sil <- silhouette(kmeans_result$cluster, dist_mat)
print(summary(sil))  # Print summary to check if silhouette object is valid

## Silhouette of 100 units in 3 clusters from silhouette.default(x = kmeans_result$cluster, dist = dist_mat) :
##  Cluster sizes and average silhouette widths:
##        38        34        28 
## 0.1741499 0.1765157 0.1411486 
## Individual silhouette widths:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## 0.00723 0.07491 0.16593 0.16571 0.24573 0.38606

# Plot the silhouette analysis
plot(sil, main = "Silhouette Plot for K-means Clustering", col = 1:3, border = NA)

# Additional PCA and clustering plot
ggplot(pca_df, aes(x = PC1, y = PC2, color = cluster)) +
  geom_point(size = 2) +
  theme_minimal() +
  labs(title = "Clustering Results on Principal Components", 
       x = "Principal Component 1", 
       y = "Principal Component 2",
       color = "Cluster")

# Plotting Crop Yield vs. Fertilizer
# Base R
plot(data$fertilizer, data$crop_yield,
     main = "Crop Yield vs. Fertilizer",
     xlab = "Fertilizer", ylab = "Crop Yield",
     bg = "black", pch = 16)
abline(lm(crop_yield ~ fertilizer, data = data), col = "red")

# ggplot2
ggplot(data, aes(fertilizer, crop_yield)) +
  geom_point() +
  geom_smooth(method = "lm") +
  labs(title = "Crop Yield vs. Fertilizer") +
  theme(plot.title = element_text(hjust = 0.5)) +
  theme(axis.text.x = element_text(face = "bold", size = 12)) +
  theme(axis.text.y = element_text(face = "bold", size = 12)) +
  theme(legend.position = "top", legend.title = element_blank())

## `geom_smooth()` using formula 'y ~ x'

advanced-statistics

Nastaran Najkar-دکتر نسترن ناجکار

2024-06-25

Including Plots