Final Project EVD

# Load necessary libraries
library(dplyr)

## Warning: package 'dplyr' was built under R version 4.3.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(openxlsx)

## Warning: package 'openxlsx' was built under R version 4.3.3

# Load the CSV and Excel files
csv_data <- read.csv("C:/Users/User/Documents/EVD/Carbon_(CO2)_Emissions_by_Country.csv")
excel_data <- read.xlsx("C:/Users/User/Documents/EVD/hdr-data.xlsx")

# Convert the "Date" column in the CSV file to a "year" column for consistency
# Ensure both `year` columns are integers
csv_data <- csv_data %>%
  mutate(Date = as.Date(Date, format = "%m/%d/%Y"))

csv_data <- csv_data %>%
  mutate(year = as.integer(format(Date, "%Y")))

excel_data <- excel_data %>%
  mutate(year = as.integer(year))

# Perform the inner join again
merged_data <- csv_data %>%
  inner_join(excel_data, by = c("Country" = "country", "year" = "year"))

# Remove a column named "column_name"
merged_data <- merged_data %>% select(-Date)


summary(merged_data)

##    Country             Region          Kilotons.of.Co2   
##  Length:4437        Length:4437        Min.   :       0  
##  Class :character   Class :character   1st Qu.:    2130  
##  Mode  :character   Mode  :character   Median :   11690  
##                                        Mean   :  157581  
##                                        3rd Qu.:   63050  
##                                        Max.   :10707220  
##  Metric.Tons.Per.Capita      year          value       
##  Min.   : 0.000         Min.   :1990   Min.   :0.2120  
##  1st Qu.: 0.590         1st Qu.:1998   1st Qu.:0.5340  
##  Median : 2.390         Median :2006   Median :0.6880  
##  Mean   : 4.338         Mean   :2005   Mean   :0.6652  
##  3rd Qu.: 6.340         3rd Qu.:2013   3rd Qu.:0.7980  
##  Max.   :47.650         Max.   :2019   Max.   :0.9610

any(is.na(merged_data))

## [1] FALSE

# Load necessary libraries
library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.3.3

library(tidyr)

## Warning: package 'tidyr' was built under R version 4.3.3

# Ensure the dataset is loaded and clean
# merged_data should already exist after the join operation

# Reshape the data for plotting (convert to long format)
long_data <- merged_data %>%
  select(`Kilotons.of.Co2`, `Metric.Tons.Per.Capita`, value) %>%  # Select only numeric columns
  pivot_longer(everything(), names_to = "Variable", values_to = "Value")

# Create the boxplot
ggplot(long_data, aes(x = Variable, y = Value)) +
  geom_boxplot() +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +  # Rotate x-axis labels
  labs(title = "Boxplots for All Numeric Columns",
       x = "Variables",
       y = "Values")

# Function to count outliers for a single numeric vector
count_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)  # First quartile (25th percentile)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)  # Third quartile (75th percentile)
  IQR <- Q3 - Q1                         # Interquartile range
  
  # Define lower and upper bounds for outliers
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  # Count outliers
  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}

# Apply the function to each numeric column in the dataset
outlier_counts <- merged_data %>%
  select(`Kilotons.of.Co2`, `Metric.Tons.Per.Capita`, value) %>%  # Select only numeric columns
  summarise(across(everything(), count_outliers))

# View the outlier counts for each column
print(outlier_counts)

##   Kilotons.of.Co2 Metric.Tons.Per.Capita value
## 1             696                    235     0

# Load the ggplot2 library
library(ggplot2)

# Create the scatterplot
ggplot(merged_data, aes(x = `Metric.Tons.Per.Capita`, y = value)) +
  geom_point(color = "blue", alpha = 0.7) +  # Add points with some transparency
  theme_minimal() +  # Use a minimal theme for a clean look
  labs(
    title = "Scatterplot: Metric Tons Per Capita vs Value",
    x = "Metric Tons Per Capita",
    y = "Value"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, size = 16),  # Center the title
    axis.title = element_text(size = 14),  # Adjust axis title size
    axis.text = element_text(size = 12)  # Adjust axis text size
  )

# Create a histogram using Base R
hist(merged_data$Metric.Tons.Per.Capita,
     main = "Histogram of Metric Tons Per Capita",
     xlab = "Metric Tons Per Capita",
     ylab = "Frequency",
     col = "blue",
     border = "black")

# Create a histogram using Base R
hist(merged_data$value,
     main = "Histogram of Human Development Index",
     xlab = "value",
     ylab = "Frequency",
     col = "blue",
     border = "black")

# Apply log transformation to "Metric.Tons.Per.Capita" and "value"
log_transformed_data <- merged_data %>%
  mutate(
    Metric.Tons.Per.Capita = log(Metric.Tons.Per.Capita + 1),  # Adding 1 to handle zero/negative values
    value = log(value + 1)  # Adding 1 to handle zero/negative values
  )

# View the transformed data
head(log_transformed_data)

##       Country Region Kilotons.of.Co2 Metric.Tons.Per.Capita year     value
## 1 Afghanistan   Asia            8930              0.2700271 2011 0.3763795
## 2 Afghanistan   Asia            8080              0.2390169 2012 0.3832195
## 3 Afghanistan   Asia            7110              0.2231436 2010 0.3708737
## 4 Afghanistan   Asia            6080              0.1484200 2019 0.4001175
## 5 Afghanistan   Asia            6070              0.1570037 2018 0.3960879
## 6 Afghanistan   Asia            5990              0.1739533 2013 0.3886580

# Load dplyr package
library(dplyr)

# Filter merged_data for the year 2018
data_2018 <- log_transformed_data %>%
  filter(year == 2018)

# View the filtered data
head(data_2018)

##       Country   Region Kilotons.of.Co2 Metric.Tons.Per.Capita year     value
## 1 Afghanistan     Asia            6070              0.1570037 2018 0.3960879
## 2     Albania   Europe            5110              1.0224509 2018 0.5861186
## 3     Algeria   Africa          165540              1.5993876 2018 0.5538851
## 4     Andorra   Europe             490              2.0188950 2018 0.6221881
## 5      Angola   Africa           23960              0.5709795 2018 0.4687528
## 6   Argentina Americas          176900              1.6054299 2018 0.6162661

library(dplyr)

# Function to count outliers for a single numeric vector
count_outliers <- function(x) {
  Q1 <- quantile(x, 0.25, na.rm = TRUE)  # First quartile (25th percentile)
  Q3 <- quantile(x, 0.75, na.rm = TRUE)  # Third quartile (75th percentile)
  IQR <- Q3 - Q1                         # Interquartile range
  
  # Define lower and upper bounds for outliers
  lower_bound <- Q1 - 1.5 * IQR
  upper_bound <- Q3 + 1.5 * IQR
  
  # Count outliers
  sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}

# Check for outliers in the specified columns
outlier_counts <- data_2018 %>%
  select(where(is.numeric)) %>%  # Select only numeric columns
  summarise(across(everything(), ~ count_outliers(.)))


# View the outlier counts for each column
print(outlier_counts)

##   Kilotons.of.Co2 Metric.Tons.Per.Capita year value
## 1              27                      0    0     0

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

# Fit a robust polynomial regression model (degree 3 example)
robust_poly_model <- rlm(value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)

# Summarize the model
summary(robust_poly_model)

## 
## Call: rlm(formula = value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.10050 -0.02710  0.00422  0.02683  0.07986 
## 
## Coefficients:
##                                  Value    Std. Error t value 
## (Intercept)                        0.5388   0.0031   176.3457
## poly(Metric.Tons.Per.Capita, 3)1   1.0190   0.0390    26.1226
## poly(Metric.Tons.Per.Capita, 3)2  -0.3996   0.0390   -10.2432
## poly(Metric.Tons.Per.Capita, 3)3   0.0530   0.0390     1.3596
## 
## Residual standard error: 0.03991 on 159 degrees of freedom

# Make predictions
data_2018$predicted <- predict(robust_poly_model, newdata = data_2018)

# Visualize the results
library(ggplot2)
ggplot(data_2018, aes(x = Metric.Tons.Per.Capita, y = value)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_line(aes(y = predicted), color = "red") +
  labs(title = "Robust Polynomial Regression (Degree 3)",
       x = "Metric Tons Per Capita", y = "Value")

# Uji Shapiro-Wilk
shapiro.test(residuals(robust_poly_model))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(robust_poly_model)
## W = 0.98196, p-value = 0.03209

# Histogram dan Q-Q plot
library(ggplot2)
ggplot(data.frame(residuals = residuals(robust_poly_model)), aes(x = residuals)) +
  geom_histogram(binwidth = 0.01, fill = "blue", alpha = 0.6) +
  labs(title = "Histogram Residual", x = "Residual", y = "Frekuensi")

qqnorm(residuals(robust_poly_model))
qqline(residuals(robust_poly_model), col = "red")

library(lmtest)

## Warning: package 'lmtest' was built under R version 4.3.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.3.3

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(robust_poly_model)  # Uji Breusch-Pagan

## 
##  studentized Breusch-Pagan test
## 
## data:  robust_poly_model
## BP = 6.7043, df = 3, p-value = 0.08194

# Plot residual vs. nilai yang diprediksi
ggplot(data.frame(fitted = fitted(robust_poly_model), residuals = residuals(robust_poly_model)),
       aes(x = fitted, y = residuals)) +
  geom_point(color = "blue", alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(title = "Residual vs Fitted", x = "Nilai yang Diprediksi", y = "Residual")

dwtest(robust_poly_model)

## 
##  Durbin-Watson test
## 
## data:  robust_poly_model
## DW = 2.0068, p-value = 0.5159
## alternative hypothesis: true autocorrelation is greater than 0

# Load necessary library
library(caret)

## Warning: package 'caret' was built under R version 4.3.3

## Loading required package: lattice

# Set seed for reproducibility
set.seed(123)

# Split the data into training (70%) and testing (30%) sets
train_index <- createDataPartition(data_2018$value, p = 0.7, list = FALSE)
train_data <- data_2018[train_index, ]
test_data <- data_2018[-train_index, ]

# Fit the model on the training data
train_model <- lm(value ~ poly(Metric.Tons.Per.Capita, 3), data = train_data)

# Predict on the training data
train_predictions <- predict(train_model, newdata = train_data)

# Predict on the testing data
test_predictions <- predict(train_model, newdata = test_data)

# Calculate RMSE for training and testing data
train_rmse <- sqrt(mean((train_data$value - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$value - test_predictions)^2))

# Output the RMSE values
cat("Training RMSE:", train_rmse, "\n")

## Training RMSE: 0.0369579

cat("Testing RMSE:", test_rmse, "\n")

## Testing RMSE: 0.03799412

# Check for overfitting
if (test_rmse > train_rmse * 1.2) {
  cat("The model may be overfitting.\n")
} else {
  cat("The model does not appear to be overfitting.\n")
}

## The model does not appear to be overfitting.

# Fit the robust regression model
library(MASS)
robust_poly_model <- rlm(value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)

# Calculate R-squared
# Step 1: Predicted values
predicted_values <- predict(robust_poly_model)

# Step 2: Total Sum of Squares (SST)
sst <- sum((data_2018$value - mean(data_2018$value))^2)

# Step 3: Residual Sum of Squares (SSR)
ssr <- sum((data_2018$value - predicted_values)^2)

# Step 4: R-squared
r_squared <- 1 - (ssr / sst)

# Print the R-squared value
cat("R-squared:", r_squared, "\n")

## R-squared: 0.841577

# Calculate Spearman correlation
spearman_corr <- cor(data_2018$value, data_2018$Metric.Tons.Per.Capita, method = "spearman")

# Print the result
cat("Spearman Correlation:", spearman_corr, "\n")

## Spearman Correlation: 0.8731898

Final Project EVD

Joshua Giovanni Mulyanto

2024-12-04