Homework 6, DATA 101

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the data set

# Read the country-level data from the CSV file in the working directory.
AllCountries <- read_csv(file = "AllCountries.csv")

Rows: 217 Columns: 26
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr  (2): Country, Code
dbl (24): LandArea, Population, Density, GDP, Rural, CO2, PumpPrice, Militar...

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Calculate the correlation coefficient between the Population and GDP variables

# Correlation between Population and GDP; "complete.obs" restricts the
# computation to rows where both variables are present.
correlation <- with(
  AllCountries,
  cor(Population, GDP, use = "complete.obs")
)
print(correlation)

[1] -0.0449688

Fit a linear model for LifeExpectancy vs GDP

# Simple linear regression with LifeExpectancy as the response and GDP as the
# single predictor.
lm_life_gdp <- lm(formula = LifeExpectancy ~ GDP, data = AllCountries)

# Report the coefficient table, residual quantiles, and fit statistics.
summary(lm_life_gdp)


Call:
lm(formula = LifeExpectancy ~ GDP, data = AllCountries)

Residuals:
    Min      1Q  Median      3Q     Max 
-16.352  -3.882   1.550   4.458   9.330 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 6.842e+01  5.415e-01  126.36   <2e-16 ***
GDP         2.476e-04  2.141e-05   11.56   <2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.901 on 177 degrees of freedom
  (38 observations deleted due to missingness)
Multiple R-squared:  0.4304,    Adjusted R-squared:  0.4272 
F-statistic: 133.7 on 1 and 177 DF,  p-value: < 2.2e-16

Fit a linear model for Health vs CO2

# Regress Health on CO2; the fitted model is reused for the diagnostic plots.
lm_health_co2 <- lm(formula = Health ~ CO2, data = AllCountries)

# Print the model summary (coefficients, residuals, R-squared, F-test).
print(summary(lm_health_co2))


Call:
lm(formula = Health ~ CO2, data = AllCountries)

Residuals:
    Min      1Q  Median      3Q     Max 
-11.413  -4.176  -0.980   3.210  26.729 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)  9.84685    0.52417  18.786   <2e-16 ***
CO2          0.17481    0.06872   2.544   0.0118 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.688 on 183 degrees of freedom
  (32 observations deleted due to missingness)
Multiple R-squared:  0.03416,   Adjusted R-squared:  0.02888 
F-statistic: 6.471 on 1 and 183 DF,  p-value: 0.01179

Plot the residual diagnostics (residuals vs fitted, normal Q-Q)

# Draw the two diagnostic plots side by side in a 1x2 grid. par() returns the
# previous value of the settings it changes, so keep it to restore afterwards.
old_par <- par(mfrow = c(1, 2))
plot(lm_health_co2, which = 1) # residuals vs fitted values
plot(lm_health_co2, which = 2) # normal Q-Q plot of the residuals
par(old_par) # restore the layout so later base plots are not split 1x2

Plot Health vs CO2 with regression line

# Scatter plot of Health against CO2 with the fitted least-squares line
# overlaid (no confidence band).
ggplot(data = AllCountries, mapping = aes(x = CO2, y = Health)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "blue") +
  labs(
    title = "Health vs CO2 Emissions",
    x = "CO2 Emissions (Metric Tons per Capita)",
    y = "Health Expenditure (%)"
  )

`geom_smooth()` using formula = 'y ~ x'

Warning: Removed 32 rows containing non-finite outside the scale range
(`stat_smooth()`).

Warning: Removed 32 rows containing missing values or values outside the scale range
(`geom_point()`).

Extra Credit

# Extra credit: regress Unemployment on GDP.
lm_unemploy_gdp <- lm(formula = Unemployment ~ GDP, data = AllCountries)

# Print the model summary (coefficients, residuals, R-squared, F-test).
print(summary(lm_unemploy_gdp))


Call:
lm(formula = Unemployment ~ GDP, data = AllCountries)

Residuals:
   Min     1Q Median     3Q    Max 
-7.477 -3.687 -1.282  1.855 22.546 

Coefficients:
              Estimate Std. Error t value Pr(>|t|)    
(Intercept)  7.795e+00  5.220e-01   14.93   <2e-16 ***
GDP         -4.418e-05  2.036e-05   -2.17   0.0314 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 5.593 on 171 degrees of freedom
  (44 observations deleted due to missingness)
Multiple R-squared:  0.02681,   Adjusted R-squared:  0.02111 
F-statistic:  4.71 on 1 and 171 DF,  p-value: 0.03137

Plot Unemployment vs GDP with regression line

# Scatter plot of Unemployment against GDP with the fitted least-squares line
# overlaid (no confidence band).
ggplot(data = AllCountries, mapping = aes(x = GDP, y = Unemployment)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  labs(
    title = "Unemployment vs GDP",
    x = "GDP per Capita",
    y = "Unemployment (%)"
  )

`geom_smooth()` using formula = 'y ~ x'

Warning: Removed 44 rows containing non-finite outside the scale range
(`stat_smooth()`).

Warning: Removed 44 rows containing missing values or values outside the scale range
(`geom_point()`).