# Attach the packages this script calls into: tidyverse (dplyr verbs, the
# %>% pipe, ggplot2), openintro (the hfi data), statsr (plot_ss) and
# broom (tidy).
library(tidyverse)
library(openintro)
library(statsr)
library(broom)

### Access the dataset
data(hfi)

### Exercise 1. Dataset dimensions and observations
# Find the dimensions of the hfi dataset
dim(hfi)
# Answer: 1458 rows 123 variables

# Keep only the 2016 observations and the columns of interest.
hfi_2016 <- hfi %>%
  filter(year == 2016) %>%
  select(year, countries, pf_score, pf_expression_control, hf_score, pf_rank)
# Relationship between pf_score and pf_expression_control:
# scatterplot of the 2016 data with a least-squares line overlaid.
ggplot(hfi_2016, aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +
  labs(
    title = "Personal Freedom vs Expression Control",
    x = "Expression Control (pf_expression_control)",
    y = "Personal Freedom Score (pf_score)"
  )
# Note: A scatterplot is used for two numerical variables.
# The relationship between the two variables is linear.
# If you knew a country's pf_expression_control (its score out of 10, with 0
# being the most political pressure and control on media content), would you
# be comfortable using a linear model to predict its personal freedom score?
# Yes, I believe that a linear model can be used to predict the personal
# freedom score.

# Find the correlation coefficient
hfi_2016 |>
  summarise(cor(pf_expression_control, pf_score))
# Answer is 0.845
# Based on the scatterplot, the relationship is:
# Form: Linear
# Direction: Positive
# Strength: Strong
# Unusual observations: A few outliers may exist at the lower points
# Line points: draw the scatterplot and a candidate line through it.
# NOTE(review): plot_ss() is presumably interactive (it asks for two points
# that define the line) -- confirm against the statsr documentation.
plot_ss(x = pf_expression_control, y = pf_score, data = hfi_2016)

# Same plot, additionally showing the squares of the squared residuals.
plot_ss(
  x = pf_expression_control,
  y = pf_score,
  data = hfi_2016,
  showSquares = TRUE
)
# It is difficult to position the line by hand so that the squares come out
# right; the lm() function can be used to fit the regression line instead.
# Model 1: personal freedom score explained by expression control (2016).
m1 <- lm(pf_score ~ pf_expression_control, data = hfi_2016)
tidy(m1)

# Model 2: human freedom score explained by the same predictor.
# Stored as m2 so that the tidy(m2) call below refers to an existing object.
m2 <- lm(hf_score ~ pf_expression_control, data = hfi_2016)
summary(m2)
tidy(m2)
# Exercise 7.
# Scatterplot of pf_score against pf_expression_control with the
# least-squares line (no confidence band).
ggplot(data = hfi_2016, aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
# Predicted pf_score when pf_expression_control is 3.
# NOTE(review): 4.28 and 0.542 are presumably the rounded intercept and slope
# reported by tidy(m1) -- confirm against the model output.
Prediction <- 4.28 + (0.542 * 3) # Answer = 5.906

# Countries whose expression-control score is 3, with their observed pf_score.
# near() is used instead of == so the match does not depend on exact
# floating-point equality.
hfi_2016 %>%
  filter(near(pf_expression_control, 3)) %>%
  select(countries, pf_score)

# Residual = observed - predicted.
# NOTE(review): 5.47 is presumably the observed pf_score returned by the
# query above -- confirm.
observed_minus_predicted <- 5.47 - Prediction
# Attach fitted values (.fitted) and residuals (.resid) to the data used to
# fit m1, so the diagnostic plots below have something to draw from.
m1_aug <- augment(m1)

# Residuals vs fitted values, with a dashed reference line at zero.
ggplot(data = m1_aug, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
  xlab("Fitted values") +
  ylab("Residuals")

# Same formula and data as m1; kept under this name for the base-R plot below.
model <- lm(pf_score ~ pf_expression_control, data = hfi_2016)

# Exercise 8. There is a weak correlation between the variables
# NOTE(review): presumably this refers to the residuals vs the fitted values
# (no apparent pattern), not to pf_score vs pf_expression_control -- confirm.

# Exercise 9
# Histogram of the residuals, read from the augmented data frame rather than
# from the lm object itself.
ggplot(data = m1_aug, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Residuals")

plot(model, which = 1)
# If there is a bell shape it signifies normal residual conditions
# Exercise 10
# There is constant variability.
# Conditions met: constant variability is met if the points are
# scattered randomly around the horizontal dashed line (y = 0) with
# equal vertical spread across the x-axis. This shows that the
# error is consistent regardless of the predicted value.