## Warning: package 'statsr' was built under R version 4.0.3
What are the dimensions of the data set? What does each row represent?
1458 rows by 123 columns. Each row represents a single country's freedom measures for one year; the data cover 2008-2016 (162 countries × 9 years = 1458 rows).
## [1] 4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 4784 5332 5200 4910 4617
## [16] 3997 3919 3395 3536 3181 2746 2722 2840 2908 2959 3179 3349 3382 3289 3013
## [31] 2781 3247 4107 4803 4881 5681 4858 4319 5322 5560 5829 5719 6061 6120 5822
## [46] 5738 5717 5847 6203 6033 6041 6299 6533 6744 7158 7127 7246 7119 7214 7101
## [61] 7167 7302 7392 7316 7483 6647 6713 7229 7767 7626 7452 7061 7514 7656 7683
## [76] 5738 7779 7417 7687 7623 7380 7288
## # A tibble: 1,458 x 123
## year ISO_code countries region pf_rol_procedur~ pf_rol_civil pf_rol_criminal
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2016 ALB Albania Easte~ 6.66 4.55 4.67
## 2 2016 DZA Algeria Middl~ NA NA NA
## 3 2016 AGO Angola Sub-S~ NA NA NA
## 4 2016 ARG Argentina Latin~ 7.10 5.79 4.34
## 5 2016 ARM Armenia Cauca~ NA NA NA
## 6 2016 AUS Australia Ocean~ 8.44 7.53 7.36
## 7 2016 AUT Austria Weste~ 8.97 7.87 7.67
## 8 2016 AZE Azerbaij~ Cauca~ NA NA NA
## 9 2016 BHS Bahamas Latin~ 6.93 6.01 6.26
## 10 2016 BHR Bahrain Middl~ NA NA NA
## # ... with 1,448 more rows, and 116 more variables: pf_rol <dbl>,
## # pf_ss_homicide <dbl>, pf_ss_disappearances_disap <dbl>,
## # pf_ss_disappearances_violent <dbl>, pf_ss_disappearances_organized <dbl>,
## # pf_ss_disappearances_fatalities <dbl>, pf_ss_disappearances_injuries <dbl>,
## # pf_ss_disappearances <dbl>, pf_ss_women_fgm <dbl>,
## # pf_ss_women_missing <dbl>, pf_ss_women_inheritance_widows <dbl>,
## # pf_ss_women_inheritance_daughters <dbl>, pf_ss_women_inheritance <dbl>,
## # pf_ss_women <dbl>, pf_ss <dbl>, pf_movement_domestic <dbl>,
## # pf_movement_foreign <dbl>, pf_movement_women <dbl>, pf_movement <dbl>,
## # pf_religion_estop_establish <dbl>, pf_religion_estop_operate <dbl>,
## # pf_religion_estop <dbl>, pf_religion_harassment <dbl>,
## # pf_religion_restrictions <dbl>, pf_religion <dbl>,
## # pf_association_association <dbl>, pf_association_assembly <dbl>,
## # pf_association_political_establish <dbl>,
## # pf_association_political_operate <dbl>, pf_association_political <dbl>,
## # pf_association_prof_establish <dbl>, pf_association_prof_operate <dbl>,
## # pf_association_prof <dbl>, pf_association_sport_establish <dbl>,
## # pf_association_sport_operate <dbl>, pf_association_sport <dbl>,
## # pf_association <dbl>, pf_expression_killed <dbl>,
## # pf_expression_jailed <dbl>, pf_expression_influence <dbl>,
## # pf_expression_control <dbl>, pf_expression_cable <dbl>,
## # pf_expression_newspapers <dbl>, pf_expression_internet <dbl>,
## # pf_expression <dbl>, pf_identity_legal <dbl>,
## # pf_identity_parental_marriage <dbl>, pf_identity_parental_divorce <dbl>,
## # pf_identity_parental <dbl>, pf_identity_sex_male <dbl>,
## # pf_identity_sex_female <dbl>, pf_identity_sex <dbl>,
## # pf_identity_divorce <dbl>, pf_identity <dbl>, pf_score <dbl>,
## # pf_rank <dbl>, ef_government_consumption <dbl>,
## # ef_government_transfers <dbl>, ef_government_enterprises <dbl>,
## # ef_government_tax_income <dbl>, ef_government_tax_payroll <dbl>,
## # ef_government_tax <dbl>, ef_government <dbl>, ef_legal_judicial <dbl>,
## # ef_legal_courts <dbl>, ef_legal_protection <dbl>, ef_legal_military <dbl>,
## # ef_legal_integrity <dbl>, ef_legal_enforcement <dbl>,
## # ef_legal_restrictions <dbl>, ef_legal_police <dbl>, ef_legal_crime <dbl>,
## # ef_legal_gender <dbl>, ef_legal <dbl>, ef_money_growth <dbl>,
## # ef_money_sd <dbl>, ef_money_inflation <dbl>, ef_money_currency <dbl>,
## # ef_money <dbl>, ef_trade_tariffs_revenue <dbl>,
## # ef_trade_tariffs_mean <dbl>, ef_trade_tariffs_sd <dbl>,
## # ef_trade_tariffs <dbl>, ef_trade_regulatory_nontariff <dbl>,
## # ef_trade_regulatory_compliance <dbl>, ef_trade_regulatory <dbl>,
## # ef_trade_black <dbl>, ef_trade_movement_foreign <dbl>,
## # ef_trade_movement_capital <dbl>, ef_trade_movement_visit <dbl>,
## # ef_trade_movement <dbl>, ef_trade <dbl>,
## # ef_regulation_credit_ownership <dbl>, ef_regulation_credit_private <dbl>,
## # ef_regulation_credit_interest <dbl>, ef_regulation_credit <dbl>,
## # ef_regulation_labor_minwage <dbl>, ef_regulation_labor_firing <dbl>,
## # ef_regulation_labor_bargain <dbl>, ef_regulation_labor_hours <dbl>, ...
## [1] 1458 123
## [1] "year" "ISO_code"
## [3] "countries" "region"
## [5] "pf_rol_procedural" "pf_rol_civil"
## [7] "pf_rol_criminal" "pf_rol"
## [9] "pf_ss_homicide" "pf_ss_disappearances_disap"
## [11] "pf_ss_disappearances_violent" "pf_ss_disappearances_organized"
## [13] "pf_ss_disappearances_fatalities" "pf_ss_disappearances_injuries"
## [15] "pf_ss_disappearances" "pf_ss_women_fgm"
## [17] "pf_ss_women_missing" "pf_ss_women_inheritance_widows"
## [19] "pf_ss_women_inheritance_daughters" "pf_ss_women_inheritance"
## [21] "pf_ss_women" "pf_ss"
## [23] "pf_movement_domestic" "pf_movement_foreign"
## [25] "pf_movement_women" "pf_movement"
## [27] "pf_religion_estop_establish" "pf_religion_estop_operate"
## [29] "pf_religion_estop" "pf_religion_harassment"
## [31] "pf_religion_restrictions" "pf_religion"
## [33] "pf_association_association" "pf_association_assembly"
## [35] "pf_association_political_establish" "pf_association_political_operate"
## [37] "pf_association_political" "pf_association_prof_establish"
## [39] "pf_association_prof_operate" "pf_association_prof"
## [41] "pf_association_sport_establish" "pf_association_sport_operate"
## [43] "pf_association_sport" "pf_association"
## [45] "pf_expression_killed" "pf_expression_jailed"
## [47] "pf_expression_influence" "pf_expression_control"
## [49] "pf_expression_cable" "pf_expression_newspapers"
## [51] "pf_expression_internet" "pf_expression"
## [53] "pf_identity_legal" "pf_identity_parental_marriage"
## [55] "pf_identity_parental_divorce" "pf_identity_parental"
## [57] "pf_identity_sex_male" "pf_identity_sex_female"
## [59] "pf_identity_sex" "pf_identity_divorce"
## [61] "pf_identity" "pf_score"
## [63] "pf_rank" "ef_government_consumption"
## [65] "ef_government_transfers" "ef_government_enterprises"
## [67] "ef_government_tax_income" "ef_government_tax_payroll"
## [69] "ef_government_tax" "ef_government"
## [71] "ef_legal_judicial" "ef_legal_courts"
## [73] "ef_legal_protection" "ef_legal_military"
## [75] "ef_legal_integrity" "ef_legal_enforcement"
## [77] "ef_legal_restrictions" "ef_legal_police"
## [79] "ef_legal_crime" "ef_legal_gender"
## [81] "ef_legal" "ef_money_growth"
## [83] "ef_money_sd" "ef_money_inflation"
## [85] "ef_money_currency" "ef_money"
## [87] "ef_trade_tariffs_revenue" "ef_trade_tariffs_mean"
## [89] "ef_trade_tariffs_sd" "ef_trade_tariffs"
## [91] "ef_trade_regulatory_nontariff" "ef_trade_regulatory_compliance"
## [93] "ef_trade_regulatory" "ef_trade_black"
## [95] "ef_trade_movement_foreign" "ef_trade_movement_capital"
## [97] "ef_trade_movement_visit" "ef_trade_movement"
## [99] "ef_trade" "ef_regulation_credit_ownership"
## [101] "ef_regulation_credit_private" "ef_regulation_credit_interest"
## [103] "ef_regulation_credit" "ef_regulation_labor_minwage"
## [105] "ef_regulation_labor_firing" "ef_regulation_labor_bargain"
## [107] "ef_regulation_labor_hours" "ef_regulation_labor_dismissal"
## [109] "ef_regulation_labor_conscription" "ef_regulation_labor"
## [111] "ef_regulation_business_adm" "ef_regulation_business_bureaucracy"
## [113] "ef_regulation_business_start" "ef_regulation_business_bribes"
## [115] "ef_regulation_business_licensing" "ef_regulation_business_compliance"
## [117] "ef_regulation_business" "ef_regulation"
## [119] "ef_score" "ef_rank"
## [121] "hf_score" "hf_rank"
## [123] "hf_quartile"
The dataset has a lot of variables, but we are only interested in five: pf_score, pf_expression_control, hf_score, year, and region. Select these variables from the hfi dataframe, and assign the result to a dataframe named hfi_small.
Using the code below, how many rows in the hfi_small data have missing values? Next, use the drop_na() function to create another new dataframe named hfi_small_clean that does not contain any missing values.
## [1] 1458 5
## [1] 1378 5
80 rows have missing values.
What type of plot would you use to display the relationship between the personal freedom score, pf_score, and pf_expression_control? Plot this relationship using the variable pf_expression_control as the predictor. Does the relationship look linear? If you knew a country’s pf_expression_control, or its score out of 10, with 0 being the most, of political pressures and controls on media content, would you be comfortable using a linear model to predict the personal freedom score?
# Exercise 4: correlation between pf_expression_control and pf_score,
# computed on the data frame with missing values already dropped.
summarise(hfi_small_clean, cor(pf_expression_control, pf_score))
## # A tibble: 1 x 1
## `cor(pf_expression_control, pf_score)`
## <dbl>
## 1 0.796
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 4.6171 0.4914
##
## Sum of Squares: 952.153
Yes, I would be comfortable using a linear model to predict the personal freedom score because the relationship looks linear and the correlation is high (r ≈ 0.80).
Looking at your plot from the previous exercise, describe the relationship between these two variables. Make sure to discuss the form, direction, and strength of the relationship as well as any unusual observations.
There appear to be a few outliers. Otherwise, there is a moderately strong, positive, linear relationship between pf_score and pf_expression_control (r ≈ 0.80).
Using plot_ss, choose a line that does a good job of minimizing the sum of squares. Run the function several times. What was the smallest sum of squares that you got? How does it compare to your neighbors?
# Exercise 6: choose a line by clicking two points and display the squared
# residuals for that line on the cleaned data.
plot_ss(
  x = pf_expression_control,
  y = pf_score,
  data = hfi_small_clean,
  showSquares = TRUE
)
## Click two points to make a line.
## Call:
## lm(formula = y ~ x, data = pts)
##
## Coefficients:
## (Intercept) x
## 4.6171 0.4914
##
## Sum of Squares: 952.153
The best result was 952.153.
Fit a new model that uses pf_expression_control to predict hf_score, or the total human freedom score. Using the estimates from the R output, write the equation of the regression line. What does the slope tell us in the context of the relationship between human freedom and the amount of political pressure on media content?
# Exercise 7: simple linear regression of the total human freedom score
# (hf_score) on pf_expression_control, fitted on the cleaned data.
m2 <- lm(hf_score ~ pf_expression_control, data = hfi_small_clean)
# Print coefficients, residual summary, and R-squared for the fitted model.
summary(m2)
##
## Call:
## lm(formula = hf_score ~ pf_expression_control, data = hfi_small_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6198 -0.4908 0.1031 0.4703 2.2933
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.153687 0.046070 111.87 <2e-16 ***
## pf_expression_control 0.349862 0.008067 43.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.667 on 1376 degrees of freedom
## Multiple R-squared: 0.5775, Adjusted R-squared: 0.5772
## F-statistic: 1881 on 1 and 1376 DF, p-value: < 2.2e-16
About 57.8% of the variation in hf_score is explained by the regression model (R² = 0.5775), and the relationship is positive and linear.
hf_score = 5.153687 + 0.349862*pf_expression_control
For each one-unit increase in pf_expression_control, hf_score is predicted to increase by 0.349862 on average.
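Because pf_expression_control is scored from 0 to 10 with 0 meaning the most political pressure on media content, a higher score means less pressure. The data therefore suggest that as political pressure on media content decreases, the total human freedom score tends to increase.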
If someone saw the least squares regression line and not the actual data, how would they predict a country’s personal freedom score for one with a 7.4 rating for pf_expression_control? Is this an overestimate or an underestimate, and by how much? In other words, what is the residual for this prediction?
# Exercise 8: scatterplot of pf_score against pf_expression_control with the
# least-squares line overlaid (no confidence band).
hfi_small %>%
  ggplot(aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 80 rows containing non-finite values (stat_smooth).
## Warning: Removed 80 rows containing missing values (geom_point).
## [1] 8.253667
# Mean observed pf_score among rows rated 7.5 on pf_expression_control.
# near() is used instead of == so the match does not depend on exact
# floating-point equality; na.rm guards against missing pf_score values,
# since hfi_small has not had NA rows dropped.
# NOTE(review): 7.5 and 7.25 look like the observed ratings that bracket the
# 7.4 asked about in the exercise -- confirm.
pf_score2 <- hfi_small %>%
  filter(near(pf_expression_control, 7.5)) %>%
  pull(pf_score) %>%
  mean(na.rm = TRUE)
pf_score2
## [1] 8.438356
# Same computation for rows rated 7.25.
pf_score3 <- hfi_small %>%
  filter(near(pf_expression_control, 7.25)) %>%
  pull(pf_score) %>%
  mean(na.rm = TRUE)
pf_score3
## [1] 7.774527
## [1] 8.172824
## [1] -0.08084254
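Residual = observed − predicted = 8.172824 − 8.253667 = −0.08084254. The predicted value is larger than the observed value, so the regression line overestimates the personal freedom score by about 0.08.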
Is there any apparent pattern in the residuals plot? What does this indicate about the linearity of the relationship between the two variables?
# Exercise 9: residuals vs. fitted values for m2, used to check linearity.
# The dashed line marks zero residual; the red curve is a loess smooth of the
# residuals, so any systematic trend away from zero is visible.
ggplot(data = m2, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  geom_smooth(
    method = "loess",
    formula = y ~ x,
    se = FALSE,
    colour = "red"
  ) +
  xlab("Fitted values") +
  ylab("Residuals")
There is no obvious pattern in the residuals plot except a very small linearity indicated by the red line in the residual plot. Yes, the condition of normality seems to have been met. Conditions for linear regression are satisfied, so we have a strong linear correlation.
Based on the histogram and the normal probability plot, does the nearly normal residuals condition appear to be violated? Why or why not?
# Exercise 10: check the nearly-normal residuals condition for m2.
# Histogram of the residuals. The bin width is sized to the residual scale:
# the residuals span roughly -2.6 to 2.3, so a width of 25 would put every
# observation in a single bar.
ggplot(data = m2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Residuals")

# Normal probability (Q-Q) plot of the same residuals.
ggplot(data = m2, aes(sample = .resid)) +
  stat_qq()
No, this is because the conditions for normality appear to be met according to the Q-Q plot.
Based on the residuals vs. fitted plot, does the constant variability condition appear to be violated? Why or why not?
# Exercise 11: residuals vs. fitted values for m2, used to judge whether the
# spread of the residuals stays roughly constant across fitted values.
ggplot(data = m2, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")
No, this is because there are no obvious patterns.
Choose another freedom variable and a variable you think would strongly correlate with it. Produce a scatterplot of the two variables and fit a linear model. At a glance, does there seem to be a linear relationship? Remember: You’ll need to go back and add that variable into the select() statement!
# Exercise 12: simple linear regression of hf_score on
# pf_expression_influence, fitted on the full hfi data (rows with missing
# values are dropped by lm, as the summary output reports).
m3 <- lm(hf_score ~ pf_expression_influence, data = hfi)
# Print coefficients, residual summary, and R-squared for the fitted model.
summary(m3)
##
## Call:
## lm(formula = hf_score ~ pf_expression_influence, data = hfi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6889 -0.4898 0.1004 0.4824 2.4878
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.49066 0.04114 133.47 <2e-16 ***
## pf_expression_influence 0.28898 0.00706 40.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6891 on 1376 degrees of freedom
## (80 observations deleted due to missingness)
## Multiple R-squared: 0.5491, Adjusted R-squared: 0.5488
## F-statistic: 1676 on 1 and 1376 DF, p-value: < 2.2e-16
# Scatterplot of hf_score against pf_expression_influence with the
# least-squares line overlaid (no confidence band).
hfi %>%
  ggplot(aes(x = pf_expression_influence, y = hf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 80 rows containing non-finite values (stat_smooth).
## Warning: Removed 80 rows containing missing values (geom_point).
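Yes, at a glance there appears to be a positive linear relationship: the slope for pf_expression_influence is positive and highly significant, and the model explains about 55% of the variation in hf_score (R² = 0.5491).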