library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(glue)
data = read.csv("WHR20_Data.csv")
We found strong linear correlation between the variables; Logged.GDP & Happiness
Pearson’s correlation = 0.78
p-value = 5.98e-32
# Calculate Pearson's r & p-value
result = cor.test(data$Logged.GDP, data$Happiness)
text_label = glue("R = {round(result$estimate, 2)}, p = {signif(result$p.value, 3)}")
# Plot Happiness as a function of Logged.GDP
p = data %>%
ggplot(aes(x = Logged.GDP, y = Happiness)) + # Defined X as Logged.GDP and Y as Happiness
geom_point() + # Add scatter plot layer
geom_smooth(method = "lm") + # Add linear regression line
geom_text(aes(x = Inf, y = Inf, label = text_label), hjust = 1.1, vjust = 2, size = 4, check_overlap = TRUE) + # Add Pearson's r & p-value to the plot
theme_light()
p
## `geom_smooth()` using formula = 'y ~ x'
In the following code; y_tag represent our prediction based on our model coefficients
SE calculate our predictions squared errors
We return the mean of squared error in all our predictions AKA MSE
# Calculate MSE
# Arguments:
# y = vector of y
# x1 = vector of x1
# b0 = intercept
# b1 = slope
MSE = function(y, x1, b0, b1){
y_tag = b0 + b1 *x1
SE = (y-y_tag) ^2
return(mean(SE))
}
The following function calculate the R2 using SSM/SST
# Calculate R2
# Arguments:
# y_tag = predictions vector
# SSM = sum of squared error of model - שונות מוסברת
# SST = sum of squared error of data - שונות לא מוסברת
# R2 = SSM/SST
R2 = function(data, y_tag){
SSM = sum((y_tag-mean(data))^2)
SST = sum((data-mean(data))^2)
return (SSM/SST)
}
b_min = -5
b_max = 5
b_interval = 0.1
# Create a vector with the min max and interval
space = seq(b_min, b_max, b_interval)
space
## [1] -5.0 -4.9 -4.8 -4.7 -4.6 -4.5 -4.4 -4.3 -4.2 -4.1 -4.0 -3.9 -3.8 -3.7 -3.6
## [16] -3.5 -3.4 -3.3 -3.2 -3.1 -3.0 -2.9 -2.8 -2.7 -2.6 -2.5 -2.4 -2.3 -2.2 -2.1
## [31] -2.0 -1.9 -1.8 -1.7 -1.6 -1.5 -1.4 -1.3 -1.2 -1.1 -1.0 -0.9 -0.8 -0.7 -0.6
## [46] -0.5 -0.4 -0.3 -0.2 -0.1 0.0 0.1 0.2 0.3 0.4 0.5 0.6 0.7 0.8 0.9
## [61] 1.0 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 2.0 2.1 2.2 2.3 2.4
## [76] 2.5 2.6 2.7 2.8 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9
## [91] 4.0 4.1 4.2 4.3 4.4 4.5 4.6 4.7 4.8 4.9 5.0
param = expand.grid(b0 = space, b1 = space)
param$mse = NA
mse_values = numeric(nrow(param))
# Define different pair every iteration
for(i in 1:nrow(param)){
b0 = param$b0[i]
b1 = param$b1[i]
# make a y_tag vector
predicted_happiness <- b0 + b1 * data$Logged.GDP
# Calculate the residuals (difference between actual and predicted values)
residuals <- data$Happiness - predicted_happiness
mse <- mean(residuals^2)
mse_values[i] = mse
}
param$mse = mse_values
min_mse_index = which.min(param$mse)
b0_estimated = param$b0[min_mse_index] # ml value of intercept
b1_estimated = param$b1[min_mse_index] # ml value of slope
lowest_mse = param$mse[min_mse_index] # minimum mse
p = param %>% ggplot(aes(x = b0, y = b1, fill = mse)) +
geom_tile()
p
### 11. Plot log(mse)
By using log(mse) we can visual a line that represent the impact of different pairs of b0 & b1 on MSE values
p = param %>% ggplot(aes(x = b0, y = b1, fill = log(mse))) +
geom_tile()
p
# Define the fittest b0 & b1 and use them to predict happiness by Logged.GDP (y' vector)
predictedHappiness = b0_estimated + b1_estimated*data$Logged.GDP
R2_Happiness = R2(data$Happiness, predictedHappiness)
R2_Happiness
## [1] 0.5727828
# Create a linear model to predict Happiness by Logged.GDP
m1 = lm(Happiness ~ Logged.GDP, data)
We found similarity between our estimated parameter to the model coefficient:
b0_estimated
## [1] -1
b1_estimated
## [1] 0.7
coef_model = coef(m1)
coef_model
## (Intercept) Logged.GDP
## -1.1986461 0.7177385
We found similarity between our R2 estimation to the model R2
R2 estimated = 0.5727
Model R2(adjusted) = 0.5986
summary(m1)
##
## Call:
## lm(formula = Happiness ~ Logged.GDP, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29256 -0.52524 0.02843 0.57109 1.38802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.19865 0.44586 -2.688 0.00799 **
## Logged.GDP 0.71774 0.04757 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7047 on 151 degrees of freedom
## Multiple R-squared: 0.6012, Adjusted R-squared: 0.5986
## F-statistic: 227.6 on 1 and 151 DF, p-value: < 2.2e-16
scale function uses to center and scaling data, it might be use to normalize parameters and get Pearson’s r
The zero value represents the mean value of happiness
m1scaled = lm(scale(Happiness) ~ scale(Logged.GDP), data)
coefficients(m1scaled)
## (Intercept) scale(Logged.GDP)
## 4.469591e-16 7.753744e-01
Scaled parameters is identical to Pearson correlation:
cor(scale(data$Happiness), scale(data$Logged.GDP))
## [,1]
## [1,] 0.7753744
We found identical values of p-value & Pearson’s r for both models:
# install.packages("lm.beta") # use only one time
library(lm.beta)
m1beta = lm.beta(m1)
m1scaledbeta = lm.beta(m1scaled)
summary(m1beta)
##
## Call:
## lm(formula = Happiness ~ Logged.GDP, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29256 -0.52524 0.02843 0.57109 1.38802
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) -1.19865 NA 0.44586 -2.688 0.00799 **
## Logged.GDP 0.71774 0.77537 0.04757 15.088 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.7047 on 151 degrees of freedom
## Multiple R-squared: 0.6012, Adjusted R-squared: 0.5986
## F-statistic: 227.6 on 1 and 151 DF, p-value: < 2.2e-16
summary(m1scaledbeta)
##
## Call:
## lm(formula = scale(Happiness) ~ scale(Logged.GDP), data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.06115 -0.47222 0.02556 0.51344 1.24791
##
## Coefficients:
## Estimate Standardized Std. Error t value Pr(>|t|)
## (Intercept) 4.470e-16 NA 5.122e-02 0.00 1
## scale(Logged.GDP) 7.754e-01 7.754e-01 5.139e-02 15.09 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6336 on 151 degrees of freedom
## Multiple R-squared: 0.6012, Adjusted R-squared: 0.5986
## F-statistic: 227.6 on 1 and 151 DF, p-value: < 2.2e-16