#Lab 3
#Setting up R for Lab 3
#Folder that holds the Lab 3 files. The path is kept in one place and passed to
#file.path() rather than calling setwd(), so running the script does not change
#the working directory of the R session.
lab3_dir <- "/Users/timliu/Desktop/SOC252/Lab 3"
#Loading ESS data into R
lab3_data <- read.csv(file.path(lab3_dir, "Lab 3.csv"))
#Load libraries that we will use
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(tidyr)
library(ggplot2)
library(broom)
#Question 1
#I chose to use the European Social Survey (ESS) as my dataset as it is an academically driven survey
#that collects data from countries across Europe. They collect data every 2 years and have been doing
#so since 2001, which signifies that it has a wealth of data and information that I can work with.
#The survey asks about social, political and cultural issues across Europe and utilizes rigorous and
#standardized methodologies to ensure reliability and comparability. It is also a freely accessible
#dataset which allows me to select the data I work with (by country and variable) and saves me
#time in cleaning my data. Furthermore, I will be using the data on Denmark given by the ESS data
#portal and filter for the variables I will also be using for my group project: happiness, satisfaction
#with democracy and education level. This way I have a foundation and better knowledge of the data I
#will be working with for the project.
#Question 2
#Clean the data: drop every row where happy, stfdem or edulvla is NA or holds
#one of the numeric codes treated as missing (77/88/99, plus 0 and 55 for education)
lab3_data <- lab3_data %>%
  filter(!(happy %in% c(77, 88, 99, NA))) %>%
  filter(!(stfdem %in% c(77, 88, 99, NA))) %>%
  filter(!(edulvla %in% c(0, 55, 77, 88, 99, NA)))
#Collapse the education codes into two labelled categories
lab3_data <- mutate(
  lab3_data,
  edulvla = case_when(
    # code 5 forms the upper category
    edulvla == 5 ~ "Completed Post Secondary or Higher",
    # codes 1 to 4 form the lower category
    edulvla %in% 1:4 ~ "Completed Secondary or Less",
    # anything else stays missing
    .default = NA_character_
  )
)
#Frequency of each education category
table(lab3_data[["edulvla"]])
##
## Completed Post Secondary or Higher Completed Secondary or Less
## 2257 3656
#Count respondents per education category and express each count as a share of the total
education_percentage <- mutate(
  count(lab3_data, edulvla),
  Percentage = n / sum(n) * 100
)
print(education_percentage)
## edulvla n Percentage
## 1 Completed Post Secondary or Higher 2257 38.17013
## 2 Completed Secondary or Less 3656 61.82987
#One-row table of mean, median and standard deviation for happiness and for
#satisfaction with democracy
summary_table <- summarise(
  lab3_data,
  Mean_Happy          = mean(happy, na.rm = TRUE),
  Median_Happy        = median(happy, na.rm = TRUE),
  SD_Happy            = sd(happy, na.rm = TRUE),
  Mean_Satisfaction   = mean(stfdem, na.rm = TRUE),
  Median_Satisfaction = median(stfdem, na.rm = TRUE),
  SD_Satisfaction     = sd(stfdem, na.rm = TRUE)
)
print(summary_table)
## Mean_Happy Median_Happy SD_Happy Mean_Satisfaction Median_Satisfaction
## 1 8.342804 8 1.369377 7.354811 8
## SD_Satisfaction
## 1 1.903082
#Histogram of happiness scores, one bin per score point
ggplot(data = lab3_data, mapping = aes(x = happy)) +
  geom_histogram(
    binwidth = 1,
    fill = "lightblue",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Distribution of Happiness Scores") +
  xlab("Happiness Score") +
  ylab("Count") +
  theme_minimal()

#Histogram of satisfaction-with-democracy scores, one bin per score point
ggplot(data = lab3_data, mapping = aes(x = stfdem)) +
  geom_histogram(
    binwidth = 1,
    fill = "lightgreen",
    color = "black",
    alpha = 0.7
  ) +
  ggtitle("Distribution of Satisfaction with Democracy") +
  xlab("Satisfaction with Democracy Score") +
  ylab("Count") +
  theme_minimal()

#From the summary table we can see the mean and median for both happiness (8.34 and 8) and
#satisfaction with democracy (7.35 and 8) for Denmark are pretty high (both on a scale of 0-10 with
#0 being extremely unhappy/ dissatisfied and 10 extremely happy/ satisfied). Supported by the
#left-skewed histogram and normal distribution for both we can see that both happiness and satisfaction
#with democracy are high in Denmark. The SD of 1.37 and 1.90 also puts the majority of the data above
#a score of 5 which would be the middle score. From the highest education level category we can see
#that 38.17% have completed education post-secondary and/or higher and 61.83% have completed secondary
#or less.
#Question 3
#Happiness level will be my dependent variable and satisfaction with democracy and highest level of
#education will be my 2 independent variables.
#Bar plot of average happiness at each satisfaction-with-democracy score
lab3_data %>%
  group_by(stfdem) %>%
  summarise(Mean_Happiness = mean(happy, na.rm = TRUE)) %>%
  ggplot(aes(x = stfdem, y = Mean_Happiness)) +
  #The bar heights are already computed, so geom_col() draws them directly
  geom_col(fill = "steelblue", color = "black", width = 0.7) +
  #One tick per score so every bar is labelled with its own value
  scale_x_continuous(breaks = 0:10) +
  theme_minimal() +
  labs(
    title = "Average Happiness by Satisfaction with Democracy",
    #The scale starts at 0, not 1
    x = "Satisfaction with Democracy (0 = Extremely Dissatisfied, 10 = Extremely Satisfied)",
    y = "Average Happiness"
  ) +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

#Correlation between happiness and satisfaction with democracy,
#computed on rows where both values are present
correlation_happy_stfdem <- with(
  lab3_data,
  cor(happy, stfdem, use = "complete.obs")
)
print(correlation_happy_stfdem)
## [1] 0.1679637
#For the bar plot of happiness and satisfaction with democracy, we took the mean happiness score
#at each level of satisfaction with democracy. We only see a slight positive increase in scores as
#level of satisfaction with democracy increases. After finding the correlation we get a value of 0.168
#which shows a weak positive relationship between the two variables which isn't strong enough to be
#strongly predictive. This likely hints that there are many other factors that we didn't account for.
#Side-by-side boxplots of happiness, one box per education category
ggplot(data = lab3_data, mapping = aes(x = edulvla, y = happy, fill = edulvla)) +
  geom_boxplot(alpha = 0.7) +
  scale_fill_manual(values = c("lightcoral", "lightblue")) +
  ggtitle("Happiness Distribution by Education Category") +
  xlab("Education Level") +
  ylab("Happiness") +
  labs(fill = "Education Category") +
  #theme_minimal() comes before theme() so the custom tweaks below are kept
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

#Numeric education code: 1 = "Completed Secondary or Less", 2 = the other category
lab3_data[["edulvla_numeric"]] <- ifelse(
  lab3_data[["edulvla"]] == "Completed Secondary or Less",
  1,
  2
)
#Pearson correlation between the numeric education code and happiness
correlation_value_edulvla_happy <- with(
  lab3_data,
  cor(edulvla_numeric, happy, use = "complete.obs", method = "pearson")
)
print(correlation_value_edulvla_happy)
## [1] -0.01619732
#From the box plot we see that both plots are nearly identical with an IQR spanning roughly 7.7 to 8.8,
#whiskers of the same length, same median and same outliers. This would signify that there is
#no difference in happiness levels in relation to education level and when we calculate the
#coefficient we get a value of -0.016 which is super close to 0 meaning no linear relationship.
#If I was to redo this question I would probably make satisfaction with democracy my dependent
#variable instead as I think that would have a greater correlation with education level.
#Question 4
#1. I will now use satisfaction with democracy as my dependent variable and education level
#and happiness as my independent variables due to the low correlation values from question 3,
#to see if I will get higher levels of correlation from satisfaction with democracy and
#education level.
#2.
#Education Level (edulvla_numeric) → Satisfaction with Democracy (stfdem)
#↘
#Happiness (happy)
#I hypothesize that higher education leads to more positive attitudes towards democracy and
#also that happier individuals tend to be more satisfied with democracy, as they might view
#their political system more favorably.
#3. I will be using linear regression as satisfaction with democracy is a continuous variable.
#The formula would be Satisfaction with Democracy = β0 + β1 × Happiness + β2 × Education Level + ϵ
#β0 is the intercept, i.e. the expected satisfaction with democracy when both predictors
#(Happiness and Education Level) are zero.
#β1 is the coefficient for Happiness that represents how much satisfaction with democracy is
#expected to change for each one-unit increase in happiness, assuming education level is held
#constant.
#β2 is the coefficient for Education Level that represents how much satisfaction with democracy
#is expected to change for each one-unit increase in education level, assuming happiness is
#held constant.
#ϵ is the error term that represents the random variation or unaccounted-for factors affecting
#satisfaction with democracy.
#Question 5
#Fit the linear regression model: satisfaction with democracy regressed on
#happiness and the numeric education code
model <- lm(formula = stfdem ~ happy + edulvla_numeric, data = lab3_data)
#Coefficient table, residual summary, R-squared and overall F-test
summary(object = model)
##
## Call:
## lm(formula = stfdem ~ happy + edulvla_numeric, data = lab3_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.9626 -0.9043 0.3894 1.2728 4.1564
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.90417 0.16621 29.506 < 2e-16 ***
## happy 0.23545 0.01775 13.266 < 2e-16 ***
## edulvla_numeric 0.35198 0.05003 7.036 2.2e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.869 on 5910 degrees of freedom
## Multiple R-squared: 0.03628, Adjusted R-squared: 0.03596
## F-statistic: 111.3 on 2 and 5910 DF, p-value: < 2.2e-16
#95% confidence intervals for the intercept and both slope coefficients
confint(object = model, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 4.5783362 5.2300066
## happy 0.2006548 0.2702436
## edulvla_numeric 0.2539123 0.4500515
#Interpretation of the Coefficients
#Intercept (4.90417): The intercept tells us that when Happiness and Education Level are both zero,
#the expected level of Satisfaction with Democracy is 4.90. Because Education Level is coded 1 or 2,
#a value of zero lies outside the observed data, so the intercept is an extrapolation rather than an
#observed baseline level of satisfaction in Denmark.
#The coefficient for Happiness is 0.23545, which means that for each 1-point increase in Happiness,
#the Satisfaction with Democracy is expected to increase by 0.24 points, assuming education level
#is held constant. The very low p-value (less than 2e-16) shows that this result is highly statistically
#significant, meaning the effect of happiness on satisfaction with democracy is not due to random chance.
#The coefficient for Education Level is 0.35198, indicating that a one-unit increase in education level
#(from secondary education to post-secondary or higher) results in an increase of 0.35 points in
#Satisfaction with Democracy, holding happiness constant. This effect is also highly significant with a
#very low p-value (2.2e-12), confirming the influence of education on political satisfaction.
#For Happiness, the 95% confidence interval is between 0.20 and 0.27, indicating a positive effect on
#satisfaction with democracy. For Education Level, the interval ranges from 0.25 to 0.45, also indicating
#a positive influence. The Intercept has a confidence interval between 4.58 and 5.23, which means the
#baseline satisfaction with democracy, when both predictors are zero, is expected to fall within this range.
#Multiple R-squared: 0.03628:This value tells us that 3.63% of the variation in Satisfaction with Democracy
#is explained by the two predictors. This is a relatively low R-squared, indicating that while these variables
#are significant, there are other factors not included in the model that likely contribute to satisfaction with
#democracy.
#Adjusted R-squared: 0.03596: This adjusts the R-squared for the number of predictors in the model, and it is
#very close to the multiple R-squared, confirming that the model is relatively simple and doesn’t include
#excessive predictors.
#The F-statistic test had a very low p-value (< 2.2e-16), which confirms that the model as a whole is
#statistically significant, meaning that at least one of the predictors contributes to explaining
#Satisfaction with Democracy.
#The regression analysis reveals that both happiness and education level significantly influence satisfaction
#with democracy in Denmark. Specifically, for each 1-point increase in happiness, satisfaction with democracy
#is expected to increase by 0.24 points, while a one-unit increase in education level
#(from secondary to post-secondary or higher) results in an increase of 0.35 points in satisfaction with
#democracy. These relationships are statistically significant, with very low p-values
#(less than 2e-16 and 2.2e-12, respectively), indicating that these effects are unlikely to be due to random
#chance. The intercept of the model (4.90) represents the expected level of satisfaction with democracy when
#both happiness and the education code equal zero (an extrapolation, since education is coded 1 or 2). The confidence intervals for both happiness
#(0.20 to 0.27) and education level (0.25 to 0.45) further confirm the positive relationship between these
#variables and satisfaction with democracy. However, despite the significant results, the model’s R-squared
#value of 3.63% suggests that these two predictors explain only a small portion of the variation in satisfaction
#with democracy, indicating that other factors not included in the model likely contribute more to political
#attitudes. The adjusted R-squared value (0.03596) is very close to the multiple R-squared, reinforcing that
#the model is simple and doesn’t include excessive predictors. The F-statistic’s very low p-value (< 2.2e-16)
#confirms that the overall model is statistically significant, meaning that at least one of the predictors
#contributes meaningfully to explaining satisfaction with democracy.