# Pick a CRAN mirror by index without opening the graphical chooser, so the
# script can run non-interactively.
chooseCRANmirror(graphics = FALSE, ind = 1)
# Install gapminder only when it is not already available; an unconditional
# install.packages() would download and reinstall it on every run.
if (!requireNamespace("gapminder", quietly = TRUE)) {
  install.packages("gapminder")
}
##
## The downloaded binary packages are in
## /var/folders/cw/pl_wqhfn5ps7qhqx3q5mmzcm0000gn/T//RtmpEIT6Nr/downloaded_packages
library(gapminder)
# Column-by-column overview of the full dataset.
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# Restrict the data to the rows recorded for 2007 and preview the result.
gapminder07 <- gapminder[gapminder$year == 2007, ]
head(gapminder07, n = 6L)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 2007 43.8 31889923 975.
## 2 Albania Europe 2007 76.4 3600523 5937.
## 3 Algeria Africa 2007 72.3 33333216 6223.
## 4 Angola Africa 2007 42.7 12420476 4797.
## 5 Argentina Americas 2007 75.3 40301927 12779.
## 6 Australia Oceania 2007 81.2 20434176 34435.
#View(gapminder07)
# Distinct values observed in each numeric column, in order of first
# appearance.
sequence_lifeExp <- unique(gapminder[["lifeExp"]])
sequence_pop <- unique(gapminder[["pop"]])
sequence_gdpPercap <- unique(gapminder[["gdpPercap"]])
# Preview the first six distinct values of each column.
head(sequence_lifeExp, n = 6L)
## [1] 28.801 30.332 31.997 34.020 36.088 38.438
head(sequence_pop, n = 6L)
## [1] 8425333 9240934 10267083 11537966 13079460 14880372
head(sequence_gdpPercap, n = 6L)
## [1] 779.4453 820.8530 853.1007 836.1971 739.9811 786.1134
# Small worked example: swap every missing entry of a vector for zero.
NewVectorNAVals <- c(5, NA, 3, NA, 9)
# Logical mask marking the positions that hold NA.
missing_values <- is.na(NewVectorNAVals)
# Work on a copy so the original vector keeps its NA entries.
updated_vector <- NewVectorNAVals
updated_vector[missing_values] <- 0
updated_vector
## [1] 5 0 3 0 9
For life expectancy, the minimum is 23.60 years, the median is 60.71 years, and the maximum is 82.60 years. Placing these values in the context of the rest of the data lets us form more specific hypotheses about the countries and the other variables.
# Six-number summaries (min, quartiles, mean, max) of the three numeric
# columns, stored so they can be reused and then printed one by one.
summary_lifeExp <- summary(gapminder[["lifeExp"]])
summary_gdpPercap <- summary(gapminder[["gdpPercap"]])
summary_pop <- summary(gapminder[["pop"]])
summary_lifeExp
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 23.60 48.20 60.71 59.47 70.85 82.60
summary_gdpPercap
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 241.2 1202.1 3531.8 7215.3 9325.5 113523.1
summary_pop
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 6.001e+04 2.794e+06 7.024e+06 2.960e+07 1.959e+07 1.319e+09
I thought the box plot would be an easy way to show the outliers in the dataset and also the relationship between life expectancy and continent. We could go further by plotting every individual country in the dataset, but I decided that one box per continent would be easier to read.
# Scatter Plot
library(ggplot2)
# Life expectancy against GDP per capita for every row of gapminder, with
# points coloured by continent and sized by population.
ggplot(gapminder, aes(x = gdpPercap, y = lifeExp)) +
  geom_point(aes(color = continent, size = pop), alpha = 0.6, show.legend = TRUE) +
  labs(
    title = "Life Expectancy vs GDP per Capita",
    x = "GDP per Capita (Square Root Scale)",
    y = "Life Expectancy"
  ) +
  theme_minimal(base_size = 15) + # Minimal theme with larger font size
  theme(
    plot.title = element_text(size = 20, face = "bold", color = "darkblue", hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(color = "gray80", linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "lightgray"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 12)
  ) +
  scale_color_manual(values = continent_colors) + # Custom continent color palette
  scale_size_continuous(range = c(1, 5)) + # Keep points small for readability
  # Square-root x axis; observations outside the limits are dropped from the
  # plot (ggplot2 reports how many rows were removed).
  scale_x_continuous(trans = "sqrt", limits = c(0, 60000))
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).
# Box Chart Below
# Distribution of life expectancy within each continent; outliers are drawn
# as enlarged red points.
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 3) +
  labs(
    title = "Life Expectancy by Continent",
    x = "Continent",
    y = "Life Expectancy"
  ) +
  theme_minimal(base_size = 15) + # Minimal theme with larger font size
  theme(
    plot.title = element_text(size = 20, face = "bold", color = "darkblue", hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(color = "gray80", linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "lightgray"),
    legend.position = "right",
    legend.title = element_text(face = "bold"),
    legend.text = element_text(size = 12)
  ) +
  scale_fill_manual(values = continent_colors) + # Custom continent color palette
  guides(fill = guide_legend(title = "Continent")) # Title for the legend
#Bar Plot
# Number of distinct countries in each continent.
# gapminder holds several rows per country (one per surveyed year), so
# counting raw rows would show country-years instead of countries. Reduce the
# data to one row per country/continent pair before letting geom_bar() count.
ggplot(
  unique(gapminder[, c("country", "continent")]),
  aes(x = continent, fill = continent)
) +
  geom_bar(show.legend = FALSE) + # Legend would only repeat the x axis
  labs(
    title = "Number of Countries per Continent",
    x = "Continent",
    y = "Number of Countries"
  ) +
  theme_minimal(base_size = 15) + # Minimal theme with larger font size
  theme(
    plot.title = element_text(size = 20, face = "bold", color = "darkblue", hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12, angle = 45, hjust = 1),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(color = "gray80", linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "lightgray")
  ) +
  scale_fill_manual(values = continent_colors)
#Bar Plot 2
# Average life expectancy per continent, one panel per year.
# Plotting the raw values with stat = "identity" stacks every country's life
# expectancy on top of each other, so the bars overshoot the 0-100 axis and
# are dropped. Summarise to the (unweighted) mean across countries instead,
# which is what the y-axis label promises.
ggplot(gapminder, aes(x = continent, y = lifeExp, fill = continent)) +
  stat_summary(geom = "bar", fun = mean, show.legend = FALSE) +
  facet_wrap(~ year) + # Facet by year
  labs(
    title = "Life Expectancy by Continent Across Years",
    x = "Continent",
    y = "Average Life Expectancy"
  ) +
  theme_minimal(base_size = 15) + # Clean theme with large font size
  theme(
    plot.title = element_text(size = 20, face = "bold", color = "darkblue", hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12, angle = 45, hjust = 1), # Rotate axis labels for readability
    strip.text = element_text(size = 14, face = "bold"), # Bold year labels for facets
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(color = "gray80", linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "lightgray")
  ) +
  scale_fill_manual(values = continent_colors) + # Apply custom continent color palette
  scale_y_continuous(limits = c(0, 100)) # Limit the y-axis to 100 for life expectancy
# Fit a simple linear regression of life expectancy on GDP per capita using
# the gapminder07 data, then print the coefficient table and fit statistics.
model <- lm(lifeExp ~ gdpPercap, data = gapminder07)
summary(model)
##
## Call:
## lm(formula = lifeExp ~ gdpPercap, data = gapminder07)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.828 -6.316 1.922 6.898 13.128
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.957e+01 1.010e+00 58.95 <2e-16 ***
## gdpPercap 6.371e-04 5.827e-05 10.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.899 on 140 degrees of freedom
## Multiple R-squared: 0.4606, Adjusted R-squared: 0.4567
## F-statistic: 119.5 on 1 and 140 DF, p-value: < 2.2e-16
# Life expectancy against GDP per capita for the gapminder07 data, with a
# single straight-line fit drawn over the points.
# Note: scale_x_log10() transforms x before geom_smooth() fits, so the red
# line is a linear fit against log10(gdpPercap), not the untransformed
# lifeExp ~ gdpPercap relationship.
ggplot(gapminder07, aes(x = gdpPercap, y = lifeExp, color = continent)) +
  geom_point(alpha = 0.7) + # Color points by continent with transparency
  # `linewidth` replaces the `size` aesthetic for lines (deprecated in
  # ggplot2 3.4.0); fixing the colour to red yields one line, not one per
  # continent.
  geom_smooth(method = "lm", col = "red", linewidth = 1.2, se = FALSE) +
  scale_x_log10() + # Log scale for better visualization
  labs(
    title = "Linear Regression: Life Expectancy vs GDP per Capita in 2007",
    x = "GDP per Capita (log scale)",
    y = "Life Expectancy"
  ) +
  theme_minimal(base_size = 15) + # Clean theme with larger font size
  theme(
    plot.title = element_text(size = 20, face = "bold", color = "darkblue", hjust = 0.5),
    axis.title = element_text(size = 14, face = "bold"),
    axis.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold"), # Adjust legend title size
    legend.text = element_text(size = 12), # Adjust legend text size
    legend.position = "right", # Place the legend to the right
    strip.text = element_text(size = 14, face = "bold"),
    # `linewidth` replaces the `size` argument deprecated in ggplot2 3.4.0.
    panel.grid.major = element_line(color = "gray80", linewidth = 0.5),
    panel.grid.minor = element_blank(),
    panel.background = element_rect(fill = "white", color = "white"),
    plot.background = element_rect(fill = "lightgray")
  ) +
  scale_color_manual(values = continent_colors) # Apply custom continent color palette
## `geom_smooth()` using formula = 'y ~ x'
# Use the fitted model to estimate life expectancy at a GDP per capita of
# 20,000; the one-row data frame must carry the predictor's column name.
new_data <- data.frame(gdpPercap = 2e4)
predicted_lifeExp <- stats::predict(model, newdata = new_data)
predicted_lifeExp
## 1
## 72.30833