# Load necessary libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(openxlsx)
## Warning: package 'openxlsx' was built under R version 4.3.3
# Load the CSV and Excel files
csv_data <- read.csv("C:/Users/User/Documents/EVD/Carbon_(CO2)_Emissions_by_Country.csv")
excel_data <- read.xlsx("C:/Users/User/Documents/EVD/hdr-data.xlsx")
# Convert the "Date" column in the CSV file to a "year" column for consistency
# Ensure both `year` columns are integers
csv_data <- csv_data %>%
mutate(Date = as.Date(Date, format = "%m/%d/%Y"))
csv_data <- csv_data %>%
mutate(year = as.integer(format(Date, "%Y")))
excel_data <- excel_data %>%
mutate(year = as.integer(year))
# Perform the inner join again
merged_data <- csv_data %>%
inner_join(excel_data, by = c("Country" = "country", "year" = "year"))
# Remove a column named "column_name"
merged_data <- merged_data %>% select(-Date)
summary(merged_data)
## Country Region Kilotons.of.Co2
## Length:4437 Length:4437 Min. : 0
## Class :character Class :character 1st Qu.: 2130
## Mode :character Mode :character Median : 11690
## Mean : 157581
## 3rd Qu.: 63050
## Max. :10707220
## Metric.Tons.Per.Capita year value
## Min. : 0.000 Min. :1990 Min. :0.2120
## 1st Qu.: 0.590 1st Qu.:1998 1st Qu.:0.5340
## Median : 2.390 Median :2006 Median :0.6880
## Mean : 4.338 Mean :2005 Mean :0.6652
## 3rd Qu.: 6.340 3rd Qu.:2013 3rd Qu.:0.7980
## Max. :47.650 Max. :2019 Max. :0.9610
any(is.na(merged_data))
## [1] FALSE
# Load necessary libraries
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.3.3
# Ensure the dataset is loaded and clean
# merged_data should already exist after the join operation
# Reshape the data for plotting (convert to long format)
long_data <- merged_data %>%
select(`Kilotons.of.Co2`, `Metric.Tons.Per.Capita`, value) %>% # Select only numeric columns
pivot_longer(everything(), names_to = "Variable", values_to = "Value")
# Create the boxplot
ggplot(long_data, aes(x = Variable, y = Value)) +
geom_boxplot() +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) + # Rotate x-axis labels
labs(title = "Boxplots for All Numeric Columns",
x = "Variables",
y = "Values")

# Function to count outliers for a single numeric vector
count_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE) # First quartile (25th percentile)
Q3 <- quantile(x, 0.75, na.rm = TRUE) # Third quartile (75th percentile)
IQR <- Q3 - Q1 # Interquartile range
# Define lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Count outliers
sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}
# Apply the function to each numeric column in the dataset
outlier_counts <- merged_data %>%
select(`Kilotons.of.Co2`, `Metric.Tons.Per.Capita`, value) %>% # Select only numeric columns
summarise(across(everything(), count_outliers))
# View the outlier counts for each column
print(outlier_counts)
## Kilotons.of.Co2 Metric.Tons.Per.Capita value
## 1 696 235 0
# Load the ggplot2 library
library(ggplot2)
# Create the scatterplot
ggplot(merged_data, aes(x = `Metric.Tons.Per.Capita`, y = value)) +
geom_point(color = "blue", alpha = 0.7) + # Add points with some transparency
theme_minimal() + # Use a minimal theme for a clean look
labs(
title = "Scatterplot: Metric Tons Per Capita vs Value",
x = "Metric Tons Per Capita",
y = "Value"
) +
theme(
plot.title = element_text(hjust = 0.5, size = 16), # Center the title
axis.title = element_text(size = 14), # Adjust axis title size
axis.text = element_text(size = 12) # Adjust axis text size
)

# Create a histogram using Base R
hist(merged_data$Metric.Tons.Per.Capita,
main = "Histogram of Metric Tons Per Capita",
xlab = "Metric Tons Per Capita",
ylab = "Frequency",
col = "blue",
border = "black")

# Create a histogram using Base R
hist(merged_data$value,
main = "Histogram of Human Development Index",
xlab = "value",
ylab = "Frequency",
col = "blue",
border = "black")

# Apply log transformation to "Metric.Tons.Per.Capita" and "value"
log_transformed_data <- merged_data %>%
mutate(
Metric.Tons.Per.Capita = log(Metric.Tons.Per.Capita + 1), # Adding 1 to handle zero/negative values
value = log(value + 1) # Adding 1 to handle zero/negative values
)
# View the transformed data
head(log_transformed_data)
## Country Region Kilotons.of.Co2 Metric.Tons.Per.Capita year value
## 1 Afghanistan Asia 8930 0.2700271 2011 0.3763795
## 2 Afghanistan Asia 8080 0.2390169 2012 0.3832195
## 3 Afghanistan Asia 7110 0.2231436 2010 0.3708737
## 4 Afghanistan Asia 6080 0.1484200 2019 0.4001175
## 5 Afghanistan Asia 6070 0.1570037 2018 0.3960879
## 6 Afghanistan Asia 5990 0.1739533 2013 0.3886580
# Load dplyr package
library(dplyr)
# Filter merged_data for the year 2018
data_2018 <- log_transformed_data %>%
filter(year == 2018)
# View the filtered data
head(data_2018)
## Country Region Kilotons.of.Co2 Metric.Tons.Per.Capita year value
## 1 Afghanistan Asia 6070 0.1570037 2018 0.3960879
## 2 Albania Europe 5110 1.0224509 2018 0.5861186
## 3 Algeria Africa 165540 1.5993876 2018 0.5538851
## 4 Andorra Europe 490 2.0188950 2018 0.6221881
## 5 Angola Africa 23960 0.5709795 2018 0.4687528
## 6 Argentina Americas 176900 1.6054299 2018 0.6162661
library(dplyr)
# Function to count outliers for a single numeric vector
count_outliers <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE) # First quartile (25th percentile)
Q3 <- quantile(x, 0.75, na.rm = TRUE) # Third quartile (75th percentile)
IQR <- Q3 - Q1 # Interquartile range
# Define lower and upper bounds for outliers
lower_bound <- Q1 - 1.5 * IQR
upper_bound <- Q3 + 1.5 * IQR
# Count outliers
sum(x < lower_bound | x > upper_bound, na.rm = TRUE)
}
# Check for outliers in the specified columns
outlier_counts <- data_2018 %>%
select(where(is.numeric)) %>% # Select only numeric columns
summarise(across(everything(), ~ count_outliers(.)))
# View the outlier counts for each column
print(outlier_counts)
## Kilotons.of.Co2 Metric.Tons.Per.Capita year value
## 1 27 0 0 0
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Fit a robust polynomial regression model (degree 3 example)
robust_poly_model <- rlm(value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)
# Summarize the model
summary(robust_poly_model)
##
## Call: rlm(formula = value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)
## Residuals:
## Min 1Q Median 3Q Max
## -0.10050 -0.02710 0.00422 0.02683 0.07986
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 0.5388 0.0031 176.3457
## poly(Metric.Tons.Per.Capita, 3)1 1.0190 0.0390 26.1226
## poly(Metric.Tons.Per.Capita, 3)2 -0.3996 0.0390 -10.2432
## poly(Metric.Tons.Per.Capita, 3)3 0.0530 0.0390 1.3596
##
## Residual standard error: 0.03991 on 159 degrees of freedom
# Make predictions
data_2018$predicted <- predict(robust_poly_model, newdata = data_2018)
# Visualize the results
library(ggplot2)
ggplot(data_2018, aes(x = Metric.Tons.Per.Capita, y = value)) +
geom_point(color = "blue", alpha = 0.6) +
geom_line(aes(y = predicted), color = "red") +
labs(title = "Robust Polynomial Regression (Degree 3)",
x = "Metric Tons Per Capita", y = "Value")

# Uji Shapiro-Wilk
shapiro.test(residuals(robust_poly_model))
##
## Shapiro-Wilk normality test
##
## data: residuals(robust_poly_model)
## W = 0.98196, p-value = 0.03209
# Histogram dan Q-Q plot
library(ggplot2)
ggplot(data.frame(residuals = residuals(robust_poly_model)), aes(x = residuals)) +
geom_histogram(binwidth = 0.01, fill = "blue", alpha = 0.6) +
labs(title = "Histogram Residual", x = "Residual", y = "Frekuensi")

qqnorm(residuals(robust_poly_model))
qqline(residuals(robust_poly_model), col = "red")

library(lmtest)
## Warning: package 'lmtest' was built under R version 4.3.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.3.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(robust_poly_model) # Uji Breusch-Pagan
##
## studentized Breusch-Pagan test
##
## data: robust_poly_model
## BP = 6.7043, df = 3, p-value = 0.08194
# Plot residual vs. nilai yang diprediksi
ggplot(data.frame(fitted = fitted(robust_poly_model), residuals = residuals(robust_poly_model)),
aes(x = fitted, y = residuals)) +
geom_point(color = "blue", alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = "Residual vs Fitted", x = "Nilai yang Diprediksi", y = "Residual")

dwtest(robust_poly_model)
##
## Durbin-Watson test
##
## data: robust_poly_model
## DW = 2.0068, p-value = 0.5159
## alternative hypothesis: true autocorrelation is greater than 0
# Load necessary library
library(caret)
## Warning: package 'caret' was built under R version 4.3.3
## Loading required package: lattice
# Set seed for reproducibility
set.seed(123)
# Split the data into training (70%) and testing (30%) sets
train_index <- createDataPartition(data_2018$value, p = 0.7, list = FALSE)
train_data <- data_2018[train_index, ]
test_data <- data_2018[-train_index, ]
# Fit the model on the training data
train_model <- lm(value ~ poly(Metric.Tons.Per.Capita, 3), data = train_data)
# Predict on the training data
train_predictions <- predict(train_model, newdata = train_data)
# Predict on the testing data
test_predictions <- predict(train_model, newdata = test_data)
# Calculate RMSE for training and testing data
train_rmse <- sqrt(mean((train_data$value - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$value - test_predictions)^2))
# Output the RMSE values
cat("Training RMSE:", train_rmse, "\n")
## Training RMSE: 0.0369579
cat("Testing RMSE:", test_rmse, "\n")
## Testing RMSE: 0.03799412
# Check for overfitting
if (test_rmse > train_rmse * 1.2) {
cat("The model may be overfitting.\n")
} else {
cat("The model does not appear to be overfitting.\n")
}
## The model does not appear to be overfitting.
# Fit the robust regression model
library(MASS)
robust_poly_model <- rlm(value ~ poly(Metric.Tons.Per.Capita, 3), data = data_2018)
# Calculate R-squared
# Step 1: Predicted values
predicted_values <- predict(robust_poly_model)
# Step 2: Total Sum of Squares (SST)
sst <- sum((data_2018$value - mean(data_2018$value))^2)
# Step 3: Residual Sum of Squares (SSR)
ssr <- sum((data_2018$value - predicted_values)^2)
# Step 4: R-squared
r_squared <- 1 - (ssr / sst)
# Print the R-squared value
cat("R-squared:", r_squared, "\n")
## R-squared: 0.841577
# Calculate Spearman correlation
spearman_corr <- cor(data_2018$value, data_2018$Metric.Tons.Per.Capita, method = "spearman")
# Print the result
cat("Spearman Correlation:", spearman_corr, "\n")
## Spearman Correlation: 0.8731898