# PART 1 - Loading the data and installing essential packages ----

# Install only the packages that are not already available, so re-running the
# script does not re-download everything each time.
required_packages <- c("readxl", "ggplot2", "plotly")
is_installed <- vapply(
  required_packages,
  requireNamespace,
  logical(1),
  quietly = TRUE
)
if (!all(is_installed)) {
  install.packages(required_packages[!is_installed])
}

library(readxl)
library(ggplot2)
library(plotly)

# NOTE(review): machine-specific absolute path; anyone else running this script
# has to point it at their own copy of the workbook.
data_path <- "/Users/thiagotauyl/Downloads/Data for the project.xlsx"
data <- read_excel(data_path, sheet = "Data")

# PART 2 - Cleaning the data ----

# Drop every row that has a missing value in ANY column of the sheet, not only
# in the columns used by the regression below.
data <- na.omit(data)
# PART 3 - Creating the model and analysing the regression ----

# Simple linear regression of GDP growth on FDI.
model <- lm(GDP_growth ~ FDI, data = data)

# In-sample fitted values. `data` has no missing values at this point (see the
# na.omit() step), so predict() returns one value per row.
data$predicted <- predict(model)

# Each layer sits on its own line: a trailing comment on a single-line chain
# would comment out every layer that follows it.
ggplot(data, aes(x = FDI, y = GDP_growth)) +
  # Scatter plot of original data
  geom_point(color = "blue", alpha = 0.7) +
  # Regression line
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Linear Regression: GDP Growth vs FDI",
    x = "FDI",
    y = "GDP Growth"
  ) +
  theme_minimal()

summary(model)
# PART 4 - Identifying and handling outliers ----

# Cook's distance of every observation used to fit `model`.
cooks_distance <- cooks.distance(model)

# Dummy column: 1 when Cook's distance exceeds 1, otherwise 0.
data$outlier_dummy <- ifelse(cooks_distance > 1, 1, 0)

# Keep only the observations that are not flagged as outliers.
data_no_outliers <- data[cooks_distance <= 1, ]

# Refit the same regression on the outlier-free data. This object is used by
# summary() below and by the residual ranking at the end of the script.
model_no_outliers <- lm(GDP_growth ~ FDI, data = data_no_outliers)

ggplot(data_no_outliers, aes(x = FDI, y = GDP_growth)) +
  # Scatter plot of the data without outliers
  geom_point(color = "blue", alpha = 0.7) +
  # Regression line
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Linear Regression: GDP Growth vs FDI",
    x = "FDI",
    y = "GDP Growth"
  ) +
  theme_minimal()

summary(model_no_outliers)

# Observations with Cook's distance > 1
excluded_data <- data[cooks_distance > 1, ]
percentage_excluded <- nrow(excluded_data) / nrow(data) * 100

cat(
  "Percentage of data excluded due to outliers:",
  round(percentage_excluded, 2),
  "%\n"
)
cat("Excluded Observations:\n")
print(excluded_data[, c("Country", "FDI", "GDP_growth")])

# PART 5 - Validating assumptions (without outliers) ----

# NOTE(review): these residuals and fitted values come from the ORIGINAL model
# (fitted on all rows), restricted to the non-outlier rows. The `residuals`
# column is overwritten further down with the residuals of `model_no_outliers`.
# Confirm which model the diagnostics below are meant to validate.
data_no_outliers$residuals <- residuals(model)[cooks_distance <= 1]
data_no_outliers$fitted <- fitted(model)[cooks_distance <= 1]

ggplot(data_no_outliers, aes(x = residuals)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Residuals (No Outliers)",
    x = "Residuals",
    y = "Frequency"
  ) +
  theme_minimal()

ggplot(data_no_outliers, aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Fitted values",
    y = "Residuals"
  ) +
  theme_minimal()

# Normal Q-Q plot of the residuals with a reference line.
qqnorm(data_no_outliers$residuals)
qqline(data_no_outliers$residuals, col = "red")

# `moments` provides skewness() and kurtosis(); install it only when missing.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments")
}
library(moments)

skewness_value <- skewness(data_no_outliers$residuals)
kurtosis_value <- kurtosis(data_no_outliers$residuals)

cat("Skewness:", skewness_value, "\n")
cat("Kurtosis:", kurtosis_value, "\n")

# Checking the most skewed observations: rank the rows by the absolute size of
# their residual under the model fitted without outliers.
data_no_outliers$residuals <- residuals(model_no_outliers)
data_no_outliers$abs_residuals <- abs(data_no_outliers$residuals)

most_skewed <- data_no_outliers[order(-data_no_outliers$abs_residuals), ]
head(most_skewed, 5)