library(tidyverse)
library(openintro)
library(statsr)
## Warning: package 'statsr' was built under R version 4.0.3

Exercise 1

What are the dimensions of the data set?

1458 rows by 123 columns

# Print the `girls` column of the arbuthnot data as a plain vector.
# NOTE(review): arbuthnot is not referenced anywhere else in this lab —
# presumably a leftover from an earlier lab; confirm before keeping.
arbuthnot[["girls"]]
##  [1] 4683 4457 4102 4590 4839 4820 4928 4605 4457 4952 4784 5332 5200 4910 4617
## [16] 3997 3919 3395 3536 3181 2746 2722 2840 2908 2959 3179 3349 3382 3289 3013
## [31] 2781 3247 4107 4803 4881 5681 4858 4319 5322 5560 5829 5719 6061 6120 5822
## [46] 5738 5717 5847 6203 6033 6041 6299 6533 6744 7158 7127 7246 7119 7214 7101
## [61] 7167 7302 7392 7316 7483 6647 6713 7229 7767 7626 7452 7061 7514 7656 7683
## [76] 5738 7779 7417 7687 7623 7380 7288
# Display the hfi tibble (first rows plus the list of remaining variables).
print(hfi)
## # A tibble: 1,458 x 123
##     year ISO_code countries region pf_rol_procedur~ pf_rol_civil pf_rol_criminal
##    <dbl> <chr>    <chr>     <chr>             <dbl>        <dbl>           <dbl>
##  1  2016 ALB      Albania   Easte~             6.66         4.55            4.67
##  2  2016 DZA      Algeria   Middl~            NA           NA              NA   
##  3  2016 AGO      Angola    Sub-S~            NA           NA              NA   
##  4  2016 ARG      Argentina Latin~             7.10         5.79            4.34
##  5  2016 ARM      Armenia   Cauca~            NA           NA              NA   
##  6  2016 AUS      Australia Ocean~             8.44         7.53            7.36
##  7  2016 AUT      Austria   Weste~             8.97         7.87            7.67
##  8  2016 AZE      Azerbaij~ Cauca~            NA           NA              NA   
##  9  2016 BHS      Bahamas   Latin~             6.93         6.01            6.26
## 10  2016 BHR      Bahrain   Middl~            NA           NA              NA   
## # ... with 1,448 more rows, and 116 more variables: pf_rol <dbl>,
## #   pf_ss_homicide <dbl>, pf_ss_disappearances_disap <dbl>,
## #   pf_ss_disappearances_violent <dbl>, pf_ss_disappearances_organized <dbl>,
## #   pf_ss_disappearances_fatalities <dbl>, pf_ss_disappearances_injuries <dbl>,
## #   pf_ss_disappearances <dbl>, pf_ss_women_fgm <dbl>,
## #   pf_ss_women_missing <dbl>, pf_ss_women_inheritance_widows <dbl>,
## #   pf_ss_women_inheritance_daughters <dbl>, pf_ss_women_inheritance <dbl>,
## #   pf_ss_women <dbl>, pf_ss <dbl>, pf_movement_domestic <dbl>,
## #   pf_movement_foreign <dbl>, pf_movement_women <dbl>, pf_movement <dbl>,
## #   pf_religion_estop_establish <dbl>, pf_religion_estop_operate <dbl>,
## #   pf_religion_estop <dbl>, pf_religion_harassment <dbl>,
## #   pf_religion_restrictions <dbl>, pf_religion <dbl>,
## #   pf_association_association <dbl>, pf_association_assembly <dbl>,
## #   pf_association_political_establish <dbl>,
## #   pf_association_political_operate <dbl>, pf_association_political <dbl>,
## #   pf_association_prof_establish <dbl>, pf_association_prof_operate <dbl>,
## #   pf_association_prof <dbl>, pf_association_sport_establish <dbl>,
## #   pf_association_sport_operate <dbl>, pf_association_sport <dbl>,
## #   pf_association <dbl>, pf_expression_killed <dbl>,
## #   pf_expression_jailed <dbl>, pf_expression_influence <dbl>,
## #   pf_expression_control <dbl>, pf_expression_cable <dbl>,
## #   pf_expression_newspapers <dbl>, pf_expression_internet <dbl>,
## #   pf_expression <dbl>, pf_identity_legal <dbl>,
## #   pf_identity_parental_marriage <dbl>, pf_identity_parental_divorce <dbl>,
## #   pf_identity_parental <dbl>, pf_identity_sex_male <dbl>,
## #   pf_identity_sex_female <dbl>, pf_identity_sex <dbl>,
## #   pf_identity_divorce <dbl>, pf_identity <dbl>, pf_score <dbl>,
## #   pf_rank <dbl>, ef_government_consumption <dbl>,
## #   ef_government_transfers <dbl>, ef_government_enterprises <dbl>,
## #   ef_government_tax_income <dbl>, ef_government_tax_payroll <dbl>,
## #   ef_government_tax <dbl>, ef_government <dbl>, ef_legal_judicial <dbl>,
## #   ef_legal_courts <dbl>, ef_legal_protection <dbl>, ef_legal_military <dbl>,
## #   ef_legal_integrity <dbl>, ef_legal_enforcement <dbl>,
## #   ef_legal_restrictions <dbl>, ef_legal_police <dbl>, ef_legal_crime <dbl>,
## #   ef_legal_gender <dbl>, ef_legal <dbl>, ef_money_growth <dbl>,
## #   ef_money_sd <dbl>, ef_money_inflation <dbl>, ef_money_currency <dbl>,
## #   ef_money <dbl>, ef_trade_tariffs_revenue <dbl>,
## #   ef_trade_tariffs_mean <dbl>, ef_trade_tariffs_sd <dbl>,
## #   ef_trade_tariffs <dbl>, ef_trade_regulatory_nontariff <dbl>,
## #   ef_trade_regulatory_compliance <dbl>, ef_trade_regulatory <dbl>,
## #   ef_trade_black <dbl>, ef_trade_movement_foreign <dbl>,
## #   ef_trade_movement_capital <dbl>, ef_trade_movement_visit <dbl>,
## #   ef_trade_movement <dbl>, ef_trade <dbl>,
## #   ef_regulation_credit_ownership <dbl>, ef_regulation_credit_private <dbl>,
## #   ef_regulation_credit_interest <dbl>, ef_regulation_credit <dbl>,
## #   ef_regulation_labor_minwage <dbl>, ef_regulation_labor_firing <dbl>,
## #   ef_regulation_labor_bargain <dbl>, ef_regulation_labor_hours <dbl>, ...
# Dimensions of hfi, reported as rows then columns.
hfi %>% dim()
## [1] 1458  123
# Column names of hfi, in positional order.
hfi %>% names()
##   [1] "year"                               "ISO_code"                          
##   [3] "countries"                          "region"                            
##   [5] "pf_rol_procedural"                  "pf_rol_civil"                      
##   [7] "pf_rol_criminal"                    "pf_rol"                            
##   [9] "pf_ss_homicide"                     "pf_ss_disappearances_disap"        
##  [11] "pf_ss_disappearances_violent"       "pf_ss_disappearances_organized"    
##  [13] "pf_ss_disappearances_fatalities"    "pf_ss_disappearances_injuries"     
##  [15] "pf_ss_disappearances"               "pf_ss_women_fgm"                   
##  [17] "pf_ss_women_missing"                "pf_ss_women_inheritance_widows"    
##  [19] "pf_ss_women_inheritance_daughters"  "pf_ss_women_inheritance"           
##  [21] "pf_ss_women"                        "pf_ss"                             
##  [23] "pf_movement_domestic"               "pf_movement_foreign"               
##  [25] "pf_movement_women"                  "pf_movement"                       
##  [27] "pf_religion_estop_establish"        "pf_religion_estop_operate"         
##  [29] "pf_religion_estop"                  "pf_religion_harassment"            
##  [31] "pf_religion_restrictions"           "pf_religion"                       
##  [33] "pf_association_association"         "pf_association_assembly"           
##  [35] "pf_association_political_establish" "pf_association_political_operate"  
##  [37] "pf_association_political"           "pf_association_prof_establish"     
##  [39] "pf_association_prof_operate"        "pf_association_prof"               
##  [41] "pf_association_sport_establish"     "pf_association_sport_operate"      
##  [43] "pf_association_sport"               "pf_association"                    
##  [45] "pf_expression_killed"               "pf_expression_jailed"              
##  [47] "pf_expression_influence"            "pf_expression_control"             
##  [49] "pf_expression_cable"                "pf_expression_newspapers"          
##  [51] "pf_expression_internet"             "pf_expression"                     
##  [53] "pf_identity_legal"                  "pf_identity_parental_marriage"     
##  [55] "pf_identity_parental_divorce"       "pf_identity_parental"              
##  [57] "pf_identity_sex_male"               "pf_identity_sex_female"            
##  [59] "pf_identity_sex"                    "pf_identity_divorce"               
##  [61] "pf_identity"                        "pf_score"                          
##  [63] "pf_rank"                            "ef_government_consumption"         
##  [65] "ef_government_transfers"            "ef_government_enterprises"         
##  [67] "ef_government_tax_income"           "ef_government_tax_payroll"         
##  [69] "ef_government_tax"                  "ef_government"                     
##  [71] "ef_legal_judicial"                  "ef_legal_courts"                   
##  [73] "ef_legal_protection"                "ef_legal_military"                 
##  [75] "ef_legal_integrity"                 "ef_legal_enforcement"              
##  [77] "ef_legal_restrictions"              "ef_legal_police"                   
##  [79] "ef_legal_crime"                     "ef_legal_gender"                   
##  [81] "ef_legal"                           "ef_money_growth"                   
##  [83] "ef_money_sd"                        "ef_money_inflation"                
##  [85] "ef_money_currency"                  "ef_money"                          
##  [87] "ef_trade_tariffs_revenue"           "ef_trade_tariffs_mean"             
##  [89] "ef_trade_tariffs_sd"                "ef_trade_tariffs"                  
##  [91] "ef_trade_regulatory_nontariff"      "ef_trade_regulatory_compliance"    
##  [93] "ef_trade_regulatory"                "ef_trade_black"                    
##  [95] "ef_trade_movement_foreign"          "ef_trade_movement_capital"         
##  [97] "ef_trade_movement_visit"            "ef_trade_movement"                 
##  [99] "ef_trade"                           "ef_regulation_credit_ownership"    
## [101] "ef_regulation_credit_private"       "ef_regulation_credit_interest"     
## [103] "ef_regulation_credit"               "ef_regulation_labor_minwage"       
## [105] "ef_regulation_labor_firing"         "ef_regulation_labor_bargain"       
## [107] "ef_regulation_labor_hours"          "ef_regulation_labor_dismissal"     
## [109] "ef_regulation_labor_conscription"   "ef_regulation_labor"               
## [111] "ef_regulation_business_adm"         "ef_regulation_business_bureaucracy"
## [113] "ef_regulation_business_start"       "ef_regulation_business_bribes"     
## [115] "ef_regulation_business_licensing"   "ef_regulation_business_compliance" 
## [117] "ef_regulation_business"             "ef_regulation"                     
## [119] "ef_score"                           "ef_rank"                           
## [121] "hf_score"                           "hf_rank"                           
## [123] "hf_quartile"

Exercise 2

The dataset has a lot of variables, but we are only interested in five: pf_score, pf_expression_control, hf_score, year, and region. Select these variables from the hfi dataframe, and assign the result to a dataframe named hfi_small.

# Insert code for Exercise 2 here

# Keep only the five variables of interest, selected by name rather than by
# position so the code does not silently pick the wrong columns if the column
# order of hfi ever changes. The column order of the result is unchanged:
# year (1), region (4), pf_expression_control (48), pf_score (62),
# hf_score (121).
hfi_small <- hfi %>%
  select(year, region, pf_expression_control, pf_score, hf_score)

# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the document is knitted or run as a script.
if (interactive()) {
  View(hfi_small)
}

Exercise 3

Using the code below, how many rows in the hfi_small data have missing values? Next, use the drop_na() function to create another new dataframe named hfi_small_clean that does not contain any missing values.

# Insert code for Exercise 3 here
# Number of rows and columns before removing missing values.
dim(hfi_small)
## [1] 1458    5
# Store the complete cases in hfi_small_clean, as the exercise asks, instead
# of discarding the result of drop_na().
hfi_small_clean <- drop_na(hfi_small)
dim(hfi_small_clean)
## [1] 1378    5
# Rows with at least one missing value: 1458 - 1378.
nrow(hfi_small) - nrow(hfi_small_clean)
## [1] 80

80 rows have missing values.

Exercise 4

What type of plot would you use to display the relationship between the personal freedom score, pf_score, and pf_expression_control? Plot this relationship using the variable pf_expression_control as the predictor. Does the relationship look linear? If you knew a country’s pf_expression_control, or its score out of 10, with 0 being the most, of political pressures and controls on media content, would you be comfortable using a linear model to predict the personal freedom score?

# Insert code for Exercise 4 here

# Work with complete cases only from here on.
hfi_small <- drop_na(hfi_small)

# Scatterplot of the relationship, with pf_expression_control as the
# predictor (x-axis) and pf_score as the response (y-axis), as the exercise
# asks.
ggplot(data = hfi_small, aes(x = pf_expression_control, y = pf_score)) +
  geom_point()

# Correlation coefficient between the predictor and the response.
hfi_small %>%
  summarise(cor(pf_expression_control, pf_score))
## # A tibble: 1 x 1
##   `cor(pf_expression_control, pf_score)`
##                                    <dbl>
## 1                                  0.796

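A scatterplot is the appropriate plot for two numerical variables. Yes, I would be comfortable using a linear model to predict the personal freedom score, because the correlation between the two variables is high (about 0.80).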

Exercise 5

Looking at your plot from the previous exercise, describe the relationship between these two variables. Make sure to discuss the form, direction, and strength of the relationship as well as any unusual observations.

# Insert code for Exercise 5 here

There appear to be a few outliers. The relationship is positive and linear and, with a correlation of about 0.80, fairly strong.

Exercise 6

Using plot_ss, choose a line that does a good job of minimizing the sum of squares. Run the function several times. What was the smallest sum of squares that you got? How does it compare to your neighbors?

# Insert code for Exercise 6 here 
# Plot pf_score against pf_expression_control with a fitted line, drawing the
# squared residuals and reporting their sum.
plot_ss(
  x = pf_expression_control,
  y = pf_score,
  data = hfi_small,
  showSquares = TRUE
)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      4.6171       0.4914  
## 
## Sum of Squares:  952.153

I ran the code several times, and the smallest sum of squares I got was about 952, which is close to the results of my other attempts.

Exercise 7

Fit a new model that uses pf_expression_control to predict hf_score, or the total human freedom score. Using the estimates from the R output, write the equation of the regression line. What does the slope tell us in the context of the relationship between human freedom and the amount of political pressure on media content?

# Insert code for Exercise 7 here
# Simple linear regression of the total human freedom score (hf_score) on
# pf_expression_control, followed by the coefficient table and fit statistics.
m2 <- lm(
  formula = hf_score ~ pf_expression_control,
  data = hfi_small
)
summary(m2)
## 
## Call:
## lm(formula = hf_score ~ pf_expression_control, data = hfi_small)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6198 -0.4908  0.1031  0.4703  2.2933 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.153687   0.046070  111.87   <2e-16 ***
## pf_expression_control 0.349862   0.008067   43.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.667 on 1376 degrees of freedom
## Multiple R-squared:  0.5775, Adjusted R-squared:  0.5772 
## F-statistic:  1881 on 1 and 1376 DF,  p-value: < 2.2e-16

About 57.75% of the variation in hf_score is explained by the regression model (R-squared = 0.5775), and the relationship is positive and linear.

hf_score = 5.153687 + 0.349862*pf_expression_control

For each one-unit increase in pf_expression_control, the predicted hf_score increases by 0.349862 on average.

Exercise 8

If someone saw the least squares regression line and not the actual data, how would they predict a country’s personal freedom score for one with a 7.4 rating for pf_expression_control? Is this an overestimate or an underestimate, and by how much? In other words, what is the residual for this prediction?

# Insert code for Exercise 8 here
# Scatterplot of pf_score against pf_expression_control with the least
# squares line overlaid (no confidence band).
ggplot(data = hfi_small, aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'

# Rating for which the personal freedom score is predicted.
pf_expression_control1 <- 7.4

# Prediction from the regression line. The hard-coded intercept and slope
# agree with the coefficients reported by plot_ss() for
# pf_score ~ pf_expression_control.
pf_score1 <- 4.61707 + 0.491432 * pf_expression_control1
pf_score1
## [1] 8.253667
# The observed value at 7.4 is approximated from the mean pf_score at the
# ratings 7.5 and 7.25. near() compares doubles with a tolerance instead of
# relying on exact floating-point equality.
pf_score2 <- hfi_small %>%
  filter(near(pf_expression_control, 7.5)) %>%
  pull(pf_score) %>%
  mean()
pf_score2
## [1] 8.438356
pf_score3 <- hfi_small %>%
  filter(near(pf_expression_control, 7.25)) %>%
  pull(pf_score) %>%
  mean()
pf_score3
## [1] 7.774527
# Linear interpolation between the two means: 7.4 lies
# (7.4 - 7.25) / (7.5 - 7.25) = 0.6 of the way from 7.25 to 7.5.
pf_score4 <- 0.6 * (pf_score2 - pf_score3) + pf_score3
pf_score4
## [1] 8.172824
# Residual = observed - predicted.
pf_score4 - pf_score1
## [1] -0.08084254

Residual = -0.08084254. The predicted value is larger than the observed value, so the model overestimates by about 0.08.

Exercise 9

Is there any apparent pattern in the residuals plot? What does this indicate about the linearity of the relationship between the two variables?

# Insert code for Exercise 9 here
# Histogram of the residuals of m2. The residuals span roughly -2.6 to 2.3
# (see summary(m2)), so the bin width must be a fraction of a unit; a width
# of 25 would put every residual into a single bar.
ggplot(data = m2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Residuals")

# Normal probability (Q-Q) plot of the residuals.
ggplot(data = m2, aes(sample = .resid)) +
  stat_qq()

# Base R diagnostic plots for m2, which include residuals vs. fitted values.
plot(m2)

There is no obvious pattern in the residuals plot. Yes, the condition of normality seems to have been met. The conditions for linear regression appear to be satisfied, so a linear model is appropriate for the relationship between the two variables.

Exercise 10

Based on the histogram and the normal probability plot, does the nearly normal residuals condition appear to be violated? Why or why not?

# Insert code for Exercise 10 here
# Histogram of the m2 residuals, used to judge the nearly normal residuals
# condition. The residuals span roughly -2.6 to 2.3 (see summary(m2)), so a
# bin width of 25 would collapse the histogram into one bar; use 0.25.
ggplot(data = m2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Residuals")

# Normal probability (Q-Q) plot of the same residuals.
ggplot(data = m2, aes(sample = .resid)) +
  stat_qq()

# Base R diagnostic plots for m2.
plot(m2)

No, the nearly normal residuals condition does not appear to be violated, because the conditions for normality appear to be met.

Exercise 11

Based on the residuals vs. fitted plot, does the constant variability condition appear to be violated? Why or why not?

# Insert code for Exercise 11 here
# Histogram of the m2 residuals. A bin width of 25 is far wider than the
# residual range (roughly -2.6 to 2.3, see summary(m2)) and would yield a
# single bar, so a width of 0.25 is used.
ggplot(data = m2, aes(x = .resid)) +
  geom_histogram(binwidth = 0.25) +
  xlab("Residuals")

# Normal probability (Q-Q) plot of the residuals.
ggplot(data = m2, aes(sample = .resid)) +
  stat_qq()

# Base R diagnostic plots for m2; the residuals vs. fitted plot is the one
# relevant to the constant variability condition.
plot(m2)

No, the constant variability condition does not appear to be violated, because there are no obvious patterns in the residuals vs. fitted plot.

More Practice 1

Choose another freedom variable and a variable you think would strongly correlate with it. Produce a scatterplot of the two variables and fit a linear model. At a glance, does there seem to be a linear relationship? Remember: You’ll need to go back and add that variable into the select() statement!

# Code for More Practice 1: regress hf_score on pf_expression_influence using
# the full hfi data (rows with missing values are dropped by lm() itself).
m3 <- lm(
  formula = hf_score ~ pf_expression_influence,
  data = hfi
)
summary(m3)
## 
## Call:
## lm(formula = hf_score ~ pf_expression_influence, data = hfi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6889 -0.4898  0.1004  0.4824  2.4878 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              5.49066    0.04114  133.47   <2e-16 ***
## pf_expression_influence  0.28898    0.00706   40.93   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6891 on 1376 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.5491, Adjusted R-squared:  0.5488 
## F-statistic:  1676 on 1 and 1376 DF,  p-value: < 2.2e-16
# Base R diagnostic plots for the m3 fit.
plot(m3)

# Scatterplot of hf_score against pf_expression_influence with the least
# squares line overlaid (no confidence band).
hfi %>%
  ggplot(aes(x = pf_expression_influence, y = hf_score)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 80 rows containing non-finite values (stat_smooth).
## Warning: Removed 80 rows containing missing values (geom_point).

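Yes, there seems to be a positive linear relationship between pf_expression_influence and hf_score (R-squared is about 0.55), and there are no obvious patterns in the residual plots.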

