library(tidyverse)
library(openintro)
data('hfi', package='openintro')

Exercise 1

The dimensions of the dataset are 1,458 rows by 123 columns.

dim(hfi)
## [1] 1458  123

Exercise 2

The appropriate plot would be a scatter plot. The relationship looks linear, so I would be comfortable using a linear regression here.

ggplot(hfi, aes(x=pf_score, y=pf_expression_control)) +
  geom_point()
## Warning: Removed 80 rows containing missing values (geom_point).

Exercise 3

The plot shows that the relationship looks positively linear. And, after calculating the correlation we know there is a faily strong correlation between the 2 variables.

hfi %>%
  summarise(cor(pf_expression_control, pf_score, use = "complete.obs"))
## # A tibble: 1 x 1
##   `cor(pf_expression_control, pf_score, use = "complete.obs")`
##                                                          <dbl>
## 1                                                        0.796
hfi_n <- drop_na(hfi[c('pf_score','pf_expression_control')])
DATA606::plot_ss(x = hfi_n$pf_expression_control, y = hfi_n$pf_score)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      4.6171       0.4914  
## 
## Sum of Squares:  952.153

Exercise 4

The sum of squares I got was 952.

DATA606::plot_ss(x = hfi_n$pf_expression_control, y = hfi_n$pf_score, showSquares = TRUE)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      4.6171       0.4914  
## 
## Sum of Squares:  952.153

Exercise 5

Modeling creating a linear model to predict hf_score gives us the function: hf_score = 5.15 + 0.35*pf_expression_control

m1 <- lm(pf_score ~ pf_expression_control, data = hfi)
summary(m1)
## 
## Call:
## lm(formula = pf_score ~ pf_expression_control, data = hfi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.8467 -0.5704  0.1452  0.6066  3.2060 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            4.61707    0.05745   80.36   <2e-16 ***
## pf_expression_control  0.49143    0.01006   48.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8318 on 1376 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.6342, Adjusted R-squared:  0.634 
## F-statistic:  2386 on 1 and 1376 DF,  p-value: < 2.2e-16
m2 <- lm(hf_score ~ pf_expression_control, data = hfi)
summary(m2)
## 
## Call:
## lm(formula = hf_score ~ pf_expression_control, data = hfi)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6198 -0.4908  0.1031  0.4703  2.2933 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           5.153687   0.046070  111.87   <2e-16 ***
## pf_expression_control 0.349862   0.008067   43.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.667 on 1376 degrees of freedom
##   (80 observations deleted due to missingness)
## Multiple R-squared:  0.5775, Adjusted R-squared:  0.5772 
## F-statistic:  1881 on 1 and 1376 DF,  p-value: < 2.2e-16

Exercise 6

With our regression we would expect a personal freedom score of 7.9.

ggplot(data = hfi, aes(x = pf_expression_control, y = pf_score)) +
  geom_point() +
  stat_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 80 rows containing non-finite values (stat_smooth).
## Warning: Removed 80 rows containing missing values (geom_point).

4.61707 + 0.49143*6.7
## [1] 7.909651

Exercise 7

The residual plot looks randomly distributed, so we can assume a linear regression will fit this data set.

ggplot(data = m1, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  xlab("Fitted values") +
  ylab("Residuals")

Exercise 8

The histogram and probability plot of residuals shows a normal distribution. So our normal condition is met.

ggplot(data = m1, aes(x = .resid)) +
  geom_histogram(binwidth = 0.5) +
  xlab("Residuals")

ggplot(data = m1, aes(sample = .resid)) +
  stat_qq()

Exercise 9

Yes, the constant variability condition is met. The variance is constant in the positive and negative directions across the x-axis.

Excercise 10

I chose pf_religion restrictions and pf_relgion. Based on the names of the fields I thought they might be related. After plotting a scatter plot it looks as the there is a linear rleationship between the two values.

head(hfi)
## # A tibble: 6 x 123
##    year ISO_code countries region pf_rol_procedur… pf_rol_civil pf_rol_criminal
##   <dbl> <chr>    <chr>     <chr>             <dbl>        <dbl>           <dbl>
## 1  2016 ALB      Albania   Easte…             6.66         4.55            4.67
## 2  2016 DZA      Algeria   Middl…            NA           NA              NA   
## 3  2016 AGO      Angola    Sub-S…            NA           NA              NA   
## 4  2016 ARG      Argentina Latin…             7.10         5.79            4.34
## 5  2016 ARM      Armenia   Cauca…            NA           NA              NA   
## 6  2016 AUS      Australia Ocean…             8.44         7.53            7.36
## # … with 116 more variables: pf_rol <dbl>, pf_ss_homicide <dbl>,
## #   pf_ss_disappearances_disap <dbl>, pf_ss_disappearances_violent <dbl>,
## #   pf_ss_disappearances_organized <dbl>,
## #   pf_ss_disappearances_fatalities <dbl>, pf_ss_disappearances_injuries <dbl>,
## #   pf_ss_disappearances <dbl>, pf_ss_women_fgm <dbl>,
## #   pf_ss_women_missing <dbl>, pf_ss_women_inheritance_widows <dbl>,
## #   pf_ss_women_inheritance_daughters <dbl>, pf_ss_women_inheritance <dbl>,
## #   pf_ss_women <dbl>, pf_ss <dbl>, pf_movement_domestic <dbl>,
## #   pf_movement_foreign <dbl>, pf_movement_women <dbl>, pf_movement <dbl>,
## #   pf_religion_estop_establish <dbl>, pf_religion_estop_operate <dbl>,
## #   pf_religion_estop <dbl>, pf_religion_harassment <dbl>,
## #   pf_religion_restrictions <dbl>, pf_religion <dbl>,
## #   pf_association_association <dbl>, pf_association_assembly <dbl>,
## #   pf_association_political_establish <dbl>,
## #   pf_association_political_operate <dbl>, pf_association_political <dbl>,
## #   pf_association_prof_establish <dbl>, pf_association_prof_operate <dbl>,
## #   pf_association_prof <dbl>, pf_association_sport_establish <dbl>,
## #   pf_association_sport_operate <dbl>, pf_association_sport <dbl>,
## #   pf_association <dbl>, pf_expression_killed <dbl>,
## #   pf_expression_jailed <dbl>, pf_expression_influence <dbl>,
## #   pf_expression_control <dbl>, pf_expression_cable <dbl>,
## #   pf_expression_newspapers <dbl>, pf_expression_internet <dbl>,
## #   pf_expression <dbl>, pf_identity_legal <dbl>,
## #   pf_identity_parental_marriage <dbl>, pf_identity_parental_divorce <dbl>,
## #   pf_identity_parental <dbl>, pf_identity_sex_male <dbl>,
## #   pf_identity_sex_female <dbl>, pf_identity_sex <dbl>,
## #   pf_identity_divorce <dbl>, pf_identity <dbl>, pf_score <dbl>,
## #   pf_rank <dbl>, ef_government_consumption <dbl>,
## #   ef_government_transfers <dbl>, ef_government_enterprises <dbl>,
## #   ef_government_tax_income <dbl>, ef_government_tax_payroll <dbl>,
## #   ef_government_tax <dbl>, ef_government <dbl>, ef_legal_judicial <dbl>,
## #   ef_legal_courts <dbl>, ef_legal_protection <dbl>, ef_legal_military <dbl>,
## #   ef_legal_integrity <dbl>, ef_legal_enforcement <dbl>,
## #   ef_legal_restrictions <dbl>, ef_legal_police <dbl>, ef_legal_crime <dbl>,
## #   ef_legal_gender <dbl>, ef_legal <dbl>, ef_money_growth <dbl>,
## #   ef_money_sd <dbl>, ef_money_inflation <dbl>, ef_money_currency <dbl>,
## #   ef_money <dbl>, ef_trade_tariffs_revenue <dbl>,
## #   ef_trade_tariffs_mean <dbl>, ef_trade_tariffs_sd <dbl>,
## #   ef_trade_tariffs <dbl>, ef_trade_regulatory_nontariff <dbl>,
## #   ef_trade_regulatory_compliance <dbl>, ef_trade_regulatory <dbl>,
## #   ef_trade_black <dbl>, ef_trade_movement_foreign <dbl>,
## #   ef_trade_movement_capital <dbl>, ef_trade_movement_visit <dbl>,
## #   ef_trade_movement <dbl>, ef_trade <dbl>,
## #   ef_regulation_credit_ownership <dbl>, ef_regulation_credit_private <dbl>,
## #   ef_regulation_credit_interest <dbl>, ef_regulation_credit <dbl>,
## #   ef_regulation_labor_minwage <dbl>, ef_regulation_labor_firing <dbl>,
## #   ef_regulation_labor_bargain <dbl>, ef_regulation_labor_hours <dbl>, …
hfi_1 <- drop_na(hfi[c('pf_religion_restrictions','pf_religion')])
DATA606::plot_ss(x = hfi_1$pf_religion_restrictions, y = hfi_1$pf_religion)

## Click two points to make a line.
                                
## Call:
## lm(formula = y ~ x, data = pts)
## 
## Coefficients:
## (Intercept)            x  
##      3.6265       0.5847  
## 
## Sum of Squares:  931.005

Exercise 11

The R^2 value in my new model is 0.5941 compared to 0.634 in the original model. Since our previous model has an R-squared higher it is a better predicter.

m3 <- lm(pf_religion_restrictions ~ pf_religion, data = hfi_1)
summary(m3)
## 
## Call:
## lm(formula = pf_religion_restrictions ~ pf_religion, data = hfi_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.4408 -0.6859  0.0137  0.6571  3.9790 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.72688    0.18202  -3.993 6.86e-05 ***
## pf_religion  1.01659    0.02275  44.680  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.09 on 1362 degrees of freedom
## Multiple R-squared:  0.5944, Adjusted R-squared:  0.5941 
## F-statistic:  1996 on 1 and 1362 DF,  p-value: < 2.2e-16
LS0tCnRpdGxlOiAiTGFiIE5hbWUiCmF1dGhvcjogIkF1dGhvciBOYW1lIgpkYXRlOiAiYHIgU3lzLkRhdGUoKWAiCm91dHB1dDogb3BlbmludHJvOjpsYWJfcmVwb3J0Ci0tLQoKYGBge3IgbG9hZC1wYWNrYWdlcywgbWVzc2FnZT1GQUxTRX0KbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkob3BlbmludHJvKQpkYXRhKCdoZmknLCBwYWNrYWdlPSdvcGVuaW50cm8nKQpgYGAKCiMjIyBFeGVyY2lzZSAxCgpUaGUgZGltZW5zaW9ucyBvZiB0aGUgZGF0YXNldCBhcmUgMSw0NTggcm93cyBieSAxMjMgY29sdW1ucy4gCgpgYGB7ciBjb2RlLWNodW5rLWxhYmVsfQpkaW0oaGZpKQpgYGAKCiMjIyBFeGVyY2lzZSAyCgpUaGUgYXBwcm9wcmlhdGUgcGxvdCB3b3VsZCBiZSBhIHNjYXR0ZXIgcGxvdC4gVGhlIHJlbGF0aW9uc2hpcCBsb29rcyBsaW5lYXIsIHNvIEkgd291bGQgYmUgY29tZm9ydGFibGUgdXNpbmcgYSBsaW5lYXIgcmVncmVzc2lvbiBoZXJlLiAKCmBgYHtyfQpnZ3Bsb3QoaGZpLCBhZXMoeD1wZl9zY29yZSwgeT1wZl9leHByZXNzaW9uX2NvbnRyb2wpKSArCiAgZ2VvbV9wb2ludCgpCiAgCmBgYAoKIyMjIEV4ZXJjaXNlIDMKClRoZSBwbG90IHNob3dzIHRoYXQgdGhlIHJlbGF0aW9uc2hpcCBsb29rcyBwb3NpdGl2ZWx5IGxpbmVhci4gQW5kLCBhZnRlciBjYWxjdWxhdGluZyB0aGUgY29ycmVsYXRpb24gd2Uga25vdyB0aGVyZSBpcyBhIGZhaWx5IHN0cm9uZyBjb3JyZWxhdGlvbiBiZXR3ZWVuIHRoZSAyIHZhcmlhYmxlcy4KCmBgYHtyfQpoZmkgJT4lCiAgc3VtbWFyaXNlKGNvcihwZl9leHByZXNzaW9uX2NvbnRyb2wsIHBmX3Njb3JlLCB1c2UgPSAiY29tcGxldGUub2JzIikpCmBgYAoKYGBge3J9CmhmaV9uIDwtIGRyb3BfbmEoaGZpW2MoJ3BmX3Njb3JlJywncGZfZXhwcmVzc2lvbl9jb250cm9sJyldKQpEQVRBNjA2OjpwbG90X3NzKHggPSBoZmlfbiRwZl9leHByZXNzaW9uX2NvbnRyb2wsIHkgPSBoZmlfbiRwZl9zY29yZSkKYGBgCgojIyMgRXhlcmNpc2UgNAoKVGhlIHN1bSBvZiBzcXVhcmVzIEkgZ290IHdhcyA5NTIuCgpgYGB7cn0KREFUQTYwNjo6cGxvdF9zcyh4ID0gaGZpX24kcGZfZXhwcmVzc2lvbl9jb250cm9sLCB5ID0gaGZpX24kcGZfc2NvcmUsIHNob3dTcXVhcmVzID0gVFJVRSkKYGBgCgojIyMgRXhlcmNpc2UgNQoKTW9kZWxpbmcgY3JlYXRpbmcgYSBsaW5lYXIgbW9kZWwgdG8gcHJlZGljdCBoZl9zY29yZSBnaXZlcyB1cyB0aGUgZnVuY3Rpb246IGhmX3Njb3JlID0gNS4xNSArIDAuMzUqcGZfZXhwcmVzc2lvbl9jb250cm9sCgpgYGB7cn0KbTEgPC0gbG0ocGZfc2NvcmUgfiBwZl9leHByZXNzaW9uX2NvbnRyb2wsIGRhdGEgPSBoZmkpCnN1bW1hcnkobTEpCmBgYAoKYGBge3J9Cm0yIDwtIGxtKGhmX3Njb3JlIH4gcGZfZXhwcmVzc2lvbl9jb250cm9sLCBkYXRhID0gaGZpKQpzdW1tYXJ5KG0yKQpgYGAKCiMjIyBFeGVyY2lzZSA2CgpXaXRoIG91ciByZWdyZXNzaW9uIHdlIHdvdWxkIGV4cGVjdCBhIHBlcnNvbmFsIGZyZWVkb20gc2NvcmUgb2YgNy45LgoKYGBge3J9CmdncGxvdChkYXRhID0gaGZpLCBhZXMoeCA9IHBmX2V4cHJlc3Npb25fY29udHJvbCwgeSA9IHBmX3Njb3JlKSkgKwogIGdlb21fcG9pbnQoKSArCiAgc3RhdF9zbW9vdGgobWV0aG9kID0gImxtIiwgc2UgPSBGQUxTRSkKYGBgCgpgYGB7cn0KNC42MTcwNyArIDAuNDkxNDMqNi43CmBgYAoKIyMjIEV4ZXJjaXNlIDcKClRoZSByZXNpZHVhbCBwbG90IGxvb2tzIHJhbmRvbWx5IGRpc3RyaWJ1dGVkLCBzbyB3ZSBjYW4gYXNzdW1lIGEgbGluZWFyIHJlZ3Jlc3Npb24gd2lsbCBmaXQgdGhpcyBkYXRhIHNldC4gCgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBtMSwgYWVzKHggPSAuZml0dGVkLCB5ID0gLnJlc2lkKSkgKwogIGdlb21fcG9pbnQoKSArCiAgZ2VvbV9obGluZSh5aW50ZXJjZXB0ID0gMCwgbGluZXR5cGUgPSAiZGFzaGVkIikgKwogIHhsYWIoIkZpdHRlZCB2YWx1ZXMiKSArCiAgeWxhYigiUmVzaWR1YWxzIikKYGBgCgojIyMgRXhlcmNpc2UgOAoKVGhlIGhpc3RvZ3JhbSBhbmQgcHJvYmFiaWxpdHkgcGxvdCBvZiByZXNpZHVhbHMgc2hvd3MgYSBub3JtYWwgZGlzdHJpYnV0aW9uLiBTbyBvdXIgbm9ybWFsIGNvbmRpdGlvbiBpcyBtZXQuIAoKYGBge3J9CmdncGxvdChkYXRhID0gbTEsIGFlcyh4ID0gLnJlc2lkKSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMC41KSArCiAgeGxhYigiUmVzaWR1YWxzIikKYGBgCgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBtMSwgYWVzKHNhbXBsZSA9IC5yZXNpZCkpICsKICBzdGF0X3FxKCkKYGBgCgojIyMgRXhlcmNpc2UgOQoKWWVzLCB0aGUgY29uc3RhbnQgdmFyaWFiaWxpdHkgY29uZGl0aW9uIGlzIG1ldC4gVGhlIHZhcmlhbmNlIGlzIGNvbnN0YW50IGluIHRoZSBwb3NpdGl2ZSBhbmQgbmVnYXRpdmUgZGlyZWN0aW9ucyBhY3Jvc3MgdGhlIHgtYXhpcy4KCiMjIyBFeGNlcmNpc2UgMTAKCkkgY2hvc2UgcGZfcmVsaWdpb24gcmVzdHJpY3Rpb25zIGFuZCBwZl9yZWxnaW9uLiBCYXNlZCBvbiB0aGUgbmFtZXMgb2YgdGhlIGZpZWxkcyBJIHRob3VnaHQgdGhleSBtaWdodCBiZSByZWxhdGVkLiBBZnRlciBwbG90dGluZyBhIHNjYXR0ZXIgcGxvdCBpdCBsb29rcyBhcyB0aGUgdGhlcmUgaXMgYSBsaW5lYXIgcmxlYXRpb25zaGlwIGJldHdlZW4gdGhlIHR3byB2YWx1ZXMuCgpgYGB7cn0KaGVhZChoZmkpCgpoZmlfMSA8LSBkcm9wX25hKGhmaVtjKCdwZl9yZWxpZ2lvbl9yZXN0cmljdGlvbnMnLCdwZl9yZWxpZ2lvbicpXSkKREFUQTYwNjo6cGxvdF9zcyh4ID0gaGZpXzEkcGZfcmVsaWdpb25fcmVzdHJpY3Rpb25zLCB5ID0gaGZpXzEkcGZfcmVsaWdpb24pCmBgYAoKIyMjIEV4ZXJjaXNlIDExCgpUaGUgUl4yIHZhbHVlIGluIG15IG5ldyBtb2RlbCBpcyAwLjU5NDEgY29tcGFyZWQgdG8gMC42MzQgaW4gdGhlIG9yaWdpbmFsIG1vZGVsLiBTaW5jZSBvdXIgcHJldmlvdXMgbW9kZWwgaGFzIGFuIFItc3F1YXJlZCBoaWdoZXIgaXQgaXMgYSBiZXR0ZXIgcHJlZGljdGVyLgoKYGBge3J9Cm0zIDwtIGxtKHBmX3JlbGlnaW9uX3Jlc3RyaWN0aW9ucyB+IHBmX3JlbGlnaW9uLCBkYXRhID0gaGZpXzEpCnN1bW1hcnkobTMpCmBgYAo=