# PART 1 - Loading the data and installing essential packages ----

# Install any required package that is not yet available, then attach them.
# Installing only what is missing avoids re-downloading on every run.
required_packages <- c("readxl", "ggplot2", "plotly")
missing_packages <- required_packages[
  !vapply(required_packages, requireNamespace, logical(1), quietly = TRUE)
]
if (length(missing_packages) > 0) {
  install.packages(missing_packages)
}

library(readxl)
library(ggplot2)
library(plotly)

# Load the "Data" sheet of the project workbook.
# NOTE(review): this is an absolute, machine-specific path; the script only
# runs where the workbook exists at exactly this location.
data <- read_excel(
  "/Users/thiagotauyl/Downloads/Data for the project.xlsx",
  sheet = "Data"
)

# PART 2 - Cleaning the data ----

# Clean the data by removing every row that has a missing value in any
# column. Doing this before fitting means the model uses exactly the rows of
# `data`, so per-row results can be attached back to it by position.
data <- na.omit(data)

# PART 3 - Creating the model and analysing the regression ----

# Fit the initial linear regression model of GDP growth on FDI.
model <- lm(GDP_growth ~ FDI, data = data)

# To plot the regression, first add the model's predicted values to the data.
data$predicted <- predict(model)

# Plot the original data and the regression line.
ggplot(data, aes(x = FDI, y = GDP_growth)) +
  geom_point(color = "blue", alpha = 0.7) + # Scatter plot of original data
  geom_smooth(method = "lm", se = FALSE, color = "red") + # Regression line
  labs(
    title = "Linear Regression: GDP Growth vs FDI",
    x = "FDI",
    y = "GDP Growth"
  ) +
  theme_minimal()

# Summary of the initial model.
summary(model)

# The fit may be influenced by outliers, so the next part works out how to
# identify and treat them.

# PART 4 - Identifying and handling outliers ----

# Calculate Cook's distance for every observation of the initial model and
# flag observations above the threshold of 1 (1 = outlier, 0 = not).
cooks_distance <- cooks.distance(model)
data$outlier_dummy <- ifelse(cooks_distance > 1, 1, 0)

# Data without outliers.
data_no_outliers <- data[cooks_distance <= 1, ]

# Refit the regression on the data without outliers. This object is needed by
# summary() below and by the residual analysis later in the script.
# NOTE(review): the formula is assumed to be the same as the initial model's.
model_no_outliers <- lm(GDP_growth ~ FDI, data = data_no_outliers)

# Plot the data without outliers and its regression line.
ggplot(data_no_outliers, aes(x = FDI, y = GDP_growth)) +
  geom_point(color = "blue", alpha = 0.7) + # Scatter plot of retained data
  geom_smooth(method = "lm", se = FALSE, color = "red") + # Regression line
  labs(
    title = "Linear Regression: GDP Growth vs FDI",
    x = "FDI",
    y = "GDP Growth"
  ) +
  theme_minimal()

# Summary of the model fitted without outliers.
summary(model_no_outliers)

# Summarize exclusions: the removed observations (Cook's distance > 1) and
# the share of the cleaned data set they represent, in percent.
excluded_data <- data[cooks_distance > 1, ]
percentage_excluded <- nrow(excluded_data) / nrow(data) * 100

# ------------------------- Assumptions -------------------------

# (1) Linearity; (2) Independence; (3) Homoscedasticity; (4) Normality

# (1) Linearity
# The linear regression plot already shows no sign of a non-linear pattern.

# (2) Independence
# To observe the residuals, add residuals and fitted values to
# data_no_outliers.
# NOTE(review): these values are taken from the initial `model`, restricted
# to the rows with Cook's distance <= 1; they are not from a model refitted
# without the outliers. Confirm this is the intended choice.
data_no_outliers$residuals <- residuals(model)[cooks_distance <= 1]
data_no_outliers$fitted <- fitted(model)[cooks_distance <= 1]

# Histogram of residuals (on data without outliers).
ggplot(data_no_outliers, aes(x = residuals)) +
  geom_histogram(binwidth = 1, fill = "blue", color = "black", alpha = 0.7) +
  labs(
    title = "Histogram of Residuals (No Outliers)",
    x = "Residuals",
    y = "Frequency"
  ) +
  theme_minimal()

# (3) Homoscedasticity
# Plot the residuals against the fitted values, with a dashed reference line
# at zero.
ggplot(data_no_outliers, aes(x = fitted, y = residuals)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  labs(
    title = "Residuals vs Fitted Values",
    x = "Fitted values",
    y = "Residuals"
  ) +
  theme_minimal()

# (4) Normality
# Q-Q plot of residuals with a red reference line.
qqnorm(data_no_outliers$residuals)
qqline(data_no_outliers$residuals, col = "red")

# Does it infringe normality? Yes. The line shifts a bit.

# Install the moments package only if it is missing, then attach it.
if (!requireNamespace("moments", quietly = TRUE)) {
  install.packages("moments")
}
library(moments)

# Calculate skewness of the residuals currently stored in data_no_outliers.
skewness_value <- skewness(data_no_outliers$residuals)

# Calculate kurtosis of the same residuals.
kurtosis_value <- kurtosis(data_no_outliers$residuals)

# Add residuals to the data: this overwrites the residuals column with the
# residuals of the model fitted without outliers.
# NOTE(review): model_no_outliers is not created in this part of the script;
# it must have been fitted on data_no_outliers beforehand.
data_no_outliers$residuals <- residuals(model_no_outliers)

# Create a new column for absolute residuals.
data_no_outliers$abs_residuals <- abs(data_no_outliers$residuals)

# Sort the data by absolute residuals in descending order, so the
# observations the model fits worst come first.
most_skewed <- data_no_outliers[order(-data_no_outliers$abs_residuals), ]

# View the top 5 observations with the largest absolute residuals.
head(most_skewed, 5)