# Load necessary libraries
if (!requireNamespace("WDI", quietly = TRUE)) {
  install.packages("WDI")
}
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}

library(WDI)
library(ggplot2)

# Specify the countries using their ISO-2 codes
country_codes <- c("US", "TR", "CN", "IN", "BR")  # Example country codes

# Load population and agriculture data from WDI
population_data <- WDI(country = country_codes, indicator = "SP.POP.TOTL", start = 2000, end = 2020)
agriculture_data <- WDI(country = country_codes, indicator = "NV.AGR.TOTL.CD", start = 2000, end = 2020)

# Merge the data frames by country and year
data <- merge(population_data, agriculture_data, by = c("iso2c", "year"), suffixes = c("_population", "_agriculture"))

# Clean the data by removing NA values
data <- na.omit(data)

# Calculate statistics
average_population <- mean(data$SP.POP.TOTL)
average_agriculture <- mean(data$NV.AGR.TOTL.CD)

variance_population <- var(data$SP.POP.TOTL)
variance_agriculture <- var(data$NV.AGR.TOTL.CD)

sd_population <- sd(data$SP.POP.TOTL)
sd_agriculture <- sd(data$NV.AGR.TOTL.CD)

covariance <- cov(data$SP.POP.TOTL, data$NV.AGR.TOTL.CD)

# Perform linear regression
model <- lm(NV.AGR.TOTL.CD ~ SP.POP.TOTL, data = data)

# Summary of the regression model
summary(model)
## 
## Call:
## lm(formula = NV.AGR.TOTL.CD ~ SP.POP.TOTL, data = data)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -2.878e+11 -4.959e+10  4.372e+09  3.009e+10  6.364e+11 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.637e+10  2.624e+10   0.624    0.534    
## SP.POP.TOTL 3.387e+02  3.146e+01  10.767   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.758e+11 on 103 degrees of freedom
## Multiple R-squared:  0.5295, Adjusted R-squared:  0.525 
## F-statistic: 115.9 on 1 and 103 DF,  p-value: < 2.2e-16
# Create plots
# 1. Scatter plot with regression line
ggplot(data, aes(x = SP.POP.TOTL, y = NV.AGR.TOTL.CD)) +
  geom_point() +
  geom_smooth(method = "lm", col = "blue") +
  labs(title = "Population vs Agriculture", x = "Total Population", y = "Agricultural Value (CD)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 2. Bar plots for average values
avg_data <- data.frame(
  Category = c("Average Population", "Average Agriculture"),
  Value = c(average_population, average_agriculture)
)

ggplot(avg_data, aes(x = Category, y = Value, fill = Category)) +
  geom_bar(stat = "identity") +
  labs(title = "Average Population and Agriculture", y = "Value") +
  theme_minimal()

# Print results
cat("Average Population:", average_population, "\n")
## Average Population: 630859470
cat("Average Agriculture:", average_agriculture, "\n")
## Average Agriculture: 230058634678
cat("Variance Population:", variance_population, "\n")
## Variance Population: 3.003832e+17
cat("Variance Agriculture:", variance_agriculture, "\n")
## Variance Agriculture: 6.508112e+22
cat("Standard Deviation Population:", sd_population, "\n")
## Standard Deviation Population: 548072219
cat("Standard Deviation Agriculture:", sd_agriculture, "\n")
## Standard Deviation Agriculture: 2.5511e+11
cat("Covariance:", covariance, "\n")
## Covariance: 1.017456e+20