Introduction to linear regression

Overview

In this lab, you’ll be analyzing data from Human Freedom Index reports from 2008-2016. Your aim will be to summarize a few of the relationships within the data both graphically and numerically in order to find which variables can help tell a story about freedom. The data we’re working with is in the openintro package and it’s called hfi, short for Human Freedom Index.

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(openintro)

## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata

library(statar)
data('hfi', package='openintro')

Exercise 1

What are the dimensions of the dataset? Human Freedom Index (hfi) data set has 1458 observations and 123 variables.

dim(hfi)

## [1] 1458  123

names(hfi)

##   [1] "year"                               "ISO_code"                          
##   [3] "countries"                          "region"                            
##   [5] "pf_rol_procedural"                  "pf_rol_civil"                      
##   [7] "pf_rol_criminal"                    "pf_rol"                            
##   [9] "pf_ss_homicide"                     "pf_ss_disappearances_disap"        
##  [11] "pf_ss_disappearances_violent"       "pf_ss_disappearances_organized"    
##  [13] "pf_ss_disappearances_fatalities"    "pf_ss_disappearances_injuries"     
##  [15] "pf_ss_disappearances"               "pf_ss_women_fgm"                   
##  [17] "pf_ss_women_missing"                "pf_ss_women_inheritance_widows"    
##  [19] "pf_ss_women_inheritance_daughters"  "pf_ss_women_inheritance"           
##  [21] "pf_ss_women"                        "pf_ss"                             
##  [23] "pf_movement_domestic"               "pf_movement_foreign"               
##  [25] "pf_movement_women"                  "pf_movement"                       
##  [27] "pf_religion_estop_establish"        "pf_religion_estop_operate"         
##  [29] "pf_religion_estop"                  "pf_religion_harassment"            
##  [31] "pf_religion_restrictions"           "pf_religion"                       
##  [33] "pf_association_association"         "pf_association_assembly"           
##  [35] "pf_association_political_establish" "pf_association_political_operate"  
##  [37] "pf_association_political"           "pf_association_prof_establish"     
##  [39] "pf_association_prof_operate"        "pf_association_prof"               
##  [41] "pf_association_sport_establish"     "pf_association_sport_operate"      
##  [43] "pf_association_sport"               "pf_association"                    
##  [45] "pf_expression_killed"               "pf_expression_jailed"              
##  [47] "pf_expression_influence"            "pf_expression_control"             
##  [49] "pf_expression_cable"                "pf_expression_newspapers"          
##  [51] "pf_expression_internet"             "pf_expression"                     
##  [53] "pf_identity_legal"                  "pf_identity_parental_marriage"     
##  [55] "pf_identity_parental_divorce"       "pf_identity_parental"              
##  [57] "pf_identity_sex_male"               "pf_identity_sex_female"            
##  [59] "pf_identity_sex"                    "pf_identity_divorce"               
##  [61] "pf_identity"                        "pf_score"                          
##  [63] "pf_rank"                            "ef_government_consumption"         
##  [65] "ef_government_transfers"            "ef_government_enterprises"         
##  [67] "ef_government_tax_income"           "ef_government_tax_payroll"         
##  [69] "ef_government_tax"                  "ef_government"                     
##  [71] "ef_legal_judicial"                  "ef_legal_courts"                   
##  [73] "ef_legal_protection"                "ef_legal_military"                 
##  [75] "ef_legal_integrity"                 "ef_legal_enforcement"              
##  [77] "ef_legal_restrictions"              "ef_legal_police"                   
##  [79] "ef_legal_crime"                     "ef_legal_gender"                   
##  [81] "ef_legal"                           "ef_money_growth"                   
##  [83] "ef_money_sd"                        "ef_money_inflation"                
##  [85] "ef_money_currency"                  "ef_money"                          
##  [87] "ef_trade_tariffs_revenue"           "ef_trade_tariffs_mean"             
##  [89] "ef_trade_tariffs_sd"                "ef_trade_tariffs"                  
##  [91] "ef_trade_regulatory_nontariff"      "ef_trade_regulatory_compliance"    
##  [93] "ef_trade_regulatory"                "ef_trade_black"                    
##  [95] "ef_trade_movement_foreign"          "ef_trade_movement_capital"         
##  [97] "ef_trade_movement_visit"            "ef_trade_movement"                 
##  [99] "ef_trade"                           "ef_regulation_credit_ownership"    
## [101] "ef_regulation_credit_private"       "ef_regulation_credit_interest"     
## [103] "ef_regulation_credit"               "ef_regulation_labor_minwage"       
## [105] "ef_regulation_labor_firing"         "ef_regulation_labor_bargain"       
## [107] "ef_regulation_labor_hours"          "ef_regulation_labor_dismissal"     
## [109] "ef_regulation_labor_conscription"   "ef_regulation_labor"               
## [111] "ef_regulation_business_adm"         "ef_regulation_business_bureaucracy"
## [113] "ef_regulation_business_start"       "ef_regulation_business_bribes"     
## [115] "ef_regulation_business_licensing"   "ef_regulation_business_compliance" 
## [117] "ef_regulation_business"             "ef_regulation"                     
## [119] "ef_score"                           "ef_rank"                           
## [121] "hf_score"                           "hf_rank"                           
## [123] "hf_quartile"

Exercise 2

What type of plot would you use to display the relationship between the personal freedom score, pf_score, and one of the other numerical variables? Plot this relationship using the variable pf_expression_control as the predictor. Does the relationship look linear? If you knew a country’s pf_expression_control, or its score out of 10, with 0 being the most, of political pressures and controls on media content, would you be comfortable using a linear model to predict the personal freedom score?

The scattered plot shows a pattern of linear relationship or data points follow a straight line, then using a linear model is appropriate.

plot(hfi$pf_expression_control, hfi$pf_score, 
     xlab = "Political Pressures and Media Control Score",
     ylab = "Personal Freedom Score",
     col = "skyblue",
     main = "Scatter Plot of pf_expression_control vs. pf_score")

# Fit a linear model
lm_model <- lm(pf_score ~ pf_expression_control, data = hfi)

# Add the regression line
abline(lm_model, col = "red")

Exercise 3

Looking at your plot from the previous exercise, describe the relationship between these two variables. Make sure to discuss the form, direction, and strength of the relationship as well as any unusual observations.

The direction of the relationship is positive This means that as the pf_expression_control score (indicating more political pressures and controls on media content) increases, the pf_score (Personal Freedom Score) tends to increases In other words, countries with higher levels of political pressures on media tend to have larger personal freedom scores. The strength of the relationship looks very strong by looking at how tightly the data points cluster around the regression line because the data points are closely grouped around the line.

hfi %>%
  summarise(cor(pf_expression_control, pf_score, use = "complete.obs"))

## # A tibble: 1 × 1
##   `cor(pf_expression_control, pf_score, use = "complete.obs")`
##                                                          <dbl>
## 1                                                        0.796

# This will only work interactively (i.e. will not show in the knitted document)
hfi <- hfi %>% filter(complete.cases(pf_expression_control, pf_score))
DATA606::plot_ss(x = hfi$pf_expression_control, y = hfi$pf_score)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      4.6171       0.4914  
## 
## Sum of Squares:  952.153

DATA606::plot_ss(x = hfi$pf_expression_control, y = hfi$pf_score, showSquares = TRUE)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      4.6171       0.4914  
## 
## Sum of Squares:  952.153

Exercise 4

Using plot_ss, choose a line that does a good job of minimizing the sum of squares. Run the function several times. What was the smallest sum of squares that you got? How does it compare to your neighbors? The rss variable showsthe residual sum of squares (RSS) for the linear regression model of correlation plot

# Get the residual sum of squares (RSS) 
RSS <- sum(lm_model$residuals^2)
RSS

## [1] 952.1532

Exercise 5

Fit a new model that uses pf_expression_control to predict hf_score, or the total human freedom score. Using the estimates from the R output, write the equation of the regression line. What does the slope tell us in the context of the relationship between human freedom and the amount of political pressure on media content?

The slope of regression line is positive, it means that, higher political pressure is related to greater levels of human freedom. The positive slope signifies that there is an direct relationship between political pressure on media content and human freedom in the hfi dataset.

plot(hfi$pf_expression_control, hfi$hf_score, 
     xlab = "Political Pressures and Media Control Score",
     ylab = "Total Human Freedom Score",
     col = "green",
     main = "Scatter Plot of pf_expression_control vs. hf_score")

# Fit a linear model
lm_model <- lm(hf_score ~ pf_expression_control, data = hfi)

# Add the regression line
abline(lm_model, col = "black")

Exercise 6

If someone saw the least squares regression line and not the actual data, how would they predict a country’s personal freedom school for one with a 6.7 rating for pf_expression_control? Is this an overestimate or an underestimate, and by how much? In other words, what is the residual for this prediction?

Answer:

The equation of least squares regression line for the linear model is: \[ \hat{y}=4.61707+0.49143 \times p f \_ \text {expression_control } \]

Predicted value of a country’s pf_score with pf_expression_control rating of 6.7 is \[ \hat{y}=4.61707+0.49143 \times 6.7 \approx 7.92145 \]

Residual is the difference between the predicted score and the actual score. If the actual score is higher than 7.92145, the prediction is an underestimate; if it’s lower, the prediction is an overestimate.

m1 <- lm(pf_score ~ pf_expression_control, data = hfi)
summary(m1)

## 
## Call:
## lm(formula = pf_score ~ pf_expression_control, data = hfi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8467 -0.5704  0.1452  0.6066  3.2060 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.61707    0.05745   80.36   <2e-16 ***
## pf_expression_control  0.49143    0.01006   48.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8318 on 1376 degrees of freedom
## Multiple R-squared:  0.6342, Adjusted R-squared:  0.634 
## F-statistic:  2386 on 1 and 1376 DF,  p-value: < 2.2e-16

ggplot(data = hfi, aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

Exercise 7

Is there any apparent pattern in the residuals plot? What does this indicate about the linearity of the relationship between the two variables?

# Create a residuals plot
ggplot(data = m1, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

# Create a QQ plot
ggplot(data = m1, aes(sample = .resid)) +
  stat_qq()

The residuals plot helps us assess linearity and identify any patterns or deviations. It shows that residuals are randomly scattered around zero, the relationship between two variables are linear.The QQ plot helps assess the normality of the residuals. the points in the QQ plot closely follow a straight line, that means the residuals are approximately normally distributed, and fit with linear regression model.

Exercise 8

Based on the histogram and the normal probability plot, does the nearly normal residuals condition appear to be met?

# Create a histogram of the residuals
hist(m1$residuals, main = "Histogram of Residuals", xlab = "Residuals")

This histogram of residuals apparently symmetric and bell-shaped, it would say that the nearly normal residuals condition is met.

Exercise 9

Based on the residuals vs. fitted plot, does the constant variability condition appear to be met?

Based on the residuals vs. fitted plot, the residuals are randomly scattered around the horizontal line at zero (the x-axis), so, it is said that the constant variability condition is likely met.

More Practice

Exercie 10

Choose another freedom variable and a variable you think would strongly correlate with it. Produce a scatterplot of the two variables and fit a linear model. At a glance, does there seem to be a linear relationship?

Answer: As we can see the scatterplot, two variables are in linear relationship.

plot(hfi$pf_expression_control, hfi$pf_religion, 
     xlab = "Political Pressures and Media Control Score",
     ylab = "Total Human Freedom Score",
     col = "gold",
     main = "Scatter Plot of pf_expression_control vs. pf_religion")

# Fit a linear model
lm_model1 <- lm(pf_religion ~ pf_expression_control, data = hfi)

# Add the regression line
abline(lm_model1, col = "navy")

Exercie 11

How does this relationship compare to the relationship between pf_expression_control and pf_score? Use the R2 values from the two model summaries to compare. Does your independent variable seem to predict your dependent one better? Why or why not?

m1 <- lm(pf_score ~ pf_expression_control, data = hfi)
m2 <- lm(pf_religion ~ pf_expression_control, data = hfi)
summary(m1)$r.squared  # R-squared for pf_score vs. pf_expression_control

## [1] 0.6342361

summary(m2)$r.squared  # R-squared for pf_religion vs. pf_expression_control

## [1] 0.1764961

Answer:

The R squared values represent the proportion of variability in the dependent variable that is explained by the independent variable (pf_expression_control). A higher R^2 value indicates a stronger relationship or a better predictive power of the independent variable on the dependent variable.

Exercie 12

What’s one freedom relationship you were most surprised about and why? Display the model diagnostics for the regression model analyzing this relationship.

Answer:

m3 <- lm(pf_rank ~ pf_expression_control, data = hfi)
summary(m3)

## 
## Call:
## lm(formula = pf_rank ~ pf_expression_control, data = hfi)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -105.283  -19.487   -3.506   19.578  118.513 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           161.1625     1.8471   87.25   <2e-16 ***
## pf_expression_control -15.9629     0.3234  -49.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.74 on 1376 degrees of freedom
## Multiple R-squared:  0.639,  Adjusted R-squared:  0.6388 
## F-statistic:  2436 on 1 and 1376 DF,  p-value: < 2.2e-16

ggplot(data = hfi, aes(x = pf_expression_control, y = pf_rank)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

I am so surprised when I see a negative regression line in the relationship plot of pf_rank Vs pf_expression_control. As the slope in regression model is negative, it means that, an increase in the amount of political pressure on media content is associated with a decrease in the personal freedom of rank. In other words, higher political pressure is related to lower levels of personal freedom of rank. The negative slope signifies that there is an inverse relationship between political pressure on media content and personal freedom of rank in the data set. The summary statistical values of the model such as R^2 indicates a strong explanatory power of the independent variable(s), the adjusted R^2 accounts for model complexity, and the F-statistic tests the overall significance of the model. and the residual standard error measures the typical error in the model’s predictions.

Introduction to linear regression

Lwin Shwe

2023-11-05

Overview

Exercise 1

Exercise 2

Exercise 3

Exercise 4

Exercise 5

Exercise 6

Exercise 7

Exercise 8

Exercise 9

More Practice

Exercie 10

Exercie 11

Exercie 12