In this project, we explore infant mortality data from around the world in 1970. The dataset has 101 rows and 4 columns: country, income per capita ($), infant mortality, and region.

This data can be found in my GitHub dataset repository.

Load the data.

##                      country income infmort   region
## 1                  Australia   3426    26.7     Asia
## 2                    Austria   3350    23.7   Europe
## 3                    Belgium   3346    17.0   Europe
## 4                     Canada   4751    16.8 Americas
## 5                    Denmark   5029    13.5   Europe
## 6                    Finland   3312    10.1   Europe
## 7                     France   3403    12.9   Europe
## 8               West.Germany   5040    20.4   Europe
## 9                    Ireland   2009    17.8   Europe
## 10                     Italy   2298    25.7   Europe
## 11                     Japan   3292    11.7   Europe
## 12               Netherlands   4103    11.6   Europe
## 13               New.Zealand   3723    16.2     Asia
## 14                    Norway   4102    11.3   Europe
## 15                  Portugal    956    44.8   Europe
## 16              South.Africa   1000    71.5   Africa
## 17                    Sweden   5596     9.6   Europe
## 18               Switzerland   2963    12.8   Europe
## 19                   Britain   2503    17.5   Europe
## 20             United.States   5523    17.6 Americas
## 21                   Algeria    400    86.3   Africa
## 22                   Ecuador    250    78.5 Americas
## 23                 Indonesia    110   125.0     Asia
## 24                      Iraq    560    28.1     Asia
## 25                     Libya   3010   300.0   Africa
## 26                   Nigeria    220    58.0   Africa
## 27              Saudi.Arabia   1530   650.0     Asia
## 28                 Venezuela   1240    51.7 Americas
## 29                 Argentina   1191    59.6 Americas
## 30                    Brazil    425   170.0 Americas
## 31                     Chile    590    78.0 Americas
## 32                  Colombia    426    62.8 Americas
## 33                Costa.Rica    725    54.4 Americas
## 34        Dominican.Republic    406    48.8 Americas
## 35                    Greece   1760    27.8   Europe
## 36                 Guatemala    302    79.1 Americas
## 37                    Israel   2526    22.1     Asia
## 38                   Jamaica    727    26.2 Americas
## 39                   Lebanon    631    13.6     Asia
## 40                  Malaysia    295    32.0     Asia
## 41                    Mexico    684    60.9 Americas
## 42                 Nicaragua    507    46.0 Americas
## 43                    Panama    754    34.1 Americas
## 44                      Peru    335    65.1 Americas
## 45                 Singapore   1268    20.4     Asia
## 46                     Spain   1256    15.1   Europe
## 47                    Taiwan    261    19.1     Asia
## 48       Trinidad.and.Tobago    732    26.2 Americas
## 49                   Tunisia    434    76.3   Africa
## 50                   Uruguay    799    40.4 Americas
## 51                Yugoslavia    406    43.3   Europe
## 52                    Zambia    310   259.0   Africa
## 53                   Bolivia    200    60.4 Americas
## 54                  Cameroon    100   137.0   Africa
## 55                     Congo    281   180.0   Africa
## 56                     Egypt    210   114.0   Africa
## 57               El.Salvador    319    58.2 Americas
## 58                     Ghana    217    63.7   Africa
## 59                  Honduras    284    39.3 Americas
## 60               Ivory.Coast    387   138.0   Africa
## 61                    Jordan    334    21.3     Asia
## 62               South.Korea    344    58.0     Asia
## 63                   Liberia    197   159.2   Africa
## 64                    Moroco    279   149.0   Africa
## 65          Papua.New.Guinea    477    10.2     Asia
## 66                  Paraguay    347    38.6 Americas
## 67               Philippines    230    67.9     Asia
## 68                     Syria    334    21.7     Asia
## 69                  Thailand    210    27.0     Asia
## 70                    Turkey    435   153.0     Asia
## 71             South.Vietnam    130   100.0     Asia
## 72                Afganistan     75   400.0     Asia
## 73                Bangladesh    100   124.3     Asia
## 74                     Burma     73   200.0     Asia
## 75                   Burundi     68   150.0   Africa
## 76                  Cambodia    123   100.0     Asia
## 77  Central.African.Republic    122   190.0   Africa
## 78                      Chad     70   160.0   Africa
## 79                   Dahomey     81   109.6   Africa
## 80                  Ethiopia     79    84.2   Africa
## 81                    Guinea     79   216.0   Africa
## 82                     India     93    60.6     Asia
## 83                     Kenya    169    55.0   Africa
## 84                Madagascar    120   102.0   Africa
## 85                    Malawi    130   148.3   Africa
## 86                      Mali     50   120.0   Africa
## 87                Mauritania    174   187.0   Africa
## 88                     Niger     70   200.0   Africa
## 89                  Pakistan    102   124.3     Asia
## 90                    Rwanda     61   132.9   Africa
## 91              Sierra.Leone    148   170.0   Africa
## 92                   Somalia     85   158.0   Africa
## 93                 Sri.Lanka    162    45.1     Asia
## 94                     Sudan    125   129.4   Africa
## 95                  Tanzania    120   162.5   Africa
## 96                      Togo    160   127.0   Africa
## 97                    Uganda    134   160.0   Africa
## 98               Upper.Volta     82   180.0   Africa
## 99            Southern.Yemen     96    80.0     Asia
## 100                    Yemen     77    50.0     Asia
## 101                    Zaire    118   104.0   Africa

#### Plot the data

# Quick visual overview of the whole data frame via its default plot method.
plot(x = mort)

Make a scatter plot to show the association between infant mortality and income.

# Tighten the figure margins and shrink the text before drawing.
par(
  mar = c(4.5, 4.5, 2, 2),
  cex = 0.7
)
# Raw-scale scatter plot: infant mortality (y) against income per capita (x).
plot(
  infmort ~ income,
  data = mort,
  xlab = "Income (USD) per Capita",
  ylab = "Infant Mortality Rate"
)

# Same margins and text size as the previous figure.
par(mar = c(4.5, 4.5, 2, 2), cex = 0.7)

# Simple linear regression on the log-log scale: log(infmort) is the
# response and log(income) is the predictor.
mod1 <- lm(log(infmort) ~ log(income), data = mort)

# Scatter plot on the same log-log scale with the fitted line overlaid.
# The stray positional `residuals(mort)` argument was removed: `mort` is a
# data frame, not a fitted model, so it gave plot() nothing usable.
plot(log(infmort) ~ log(income), data = mort)
abline(mod1)

# Box plot of the raw infant mortality values.
boxplot(mort$infmort,
        ylab = "infant mortality")

Fit a simple linear regression model with log(infmort) as the response variable and log(income) as the predictor variable.

# Diagnostic plots for the log-log fit; `which` spells out plot()'s default
# selection for lm objects.
plot(mod1, which = c(1, 2, 3, 5))

# Coefficient table and goodness-of-fit statistics for the log-log fit.
summary(object = mod1)
## 
## Call:
## lm(formula = log(infmort) ~ log(income), data = mort)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66694 -0.42779 -0.02649  0.30441  3.08415 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.14582    0.31654  22.575   <2e-16 ***
## log(income) -0.51179    0.05122  -9.992   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6867 on 99 degrees of freedom
## Multiple R-squared:  0.5021, Adjusted R-squared:  0.4971 
## F-statistic: 99.84 on 1 and 99 DF,  p-value: < 2.2e-16
This can also be interpreted as:
  • A 1% increase in income is associated with a 0.512% decrease in infant mortality rate.

Let’s obtain a prediction, with a 95% prediction interval, for log(infmort), assuming a $300 income.

# Single new observation with an income of 300.
md <- data.frame(income = 300)

# Fitted value plus prediction interval on the log(infmort) scale.
# level = 0.95 is predict()'s default for lm fits, written out explicitly.
p <- predict(
  object = mod1,
  newdata = md,
  interval = "prediction",
  level = 0.95
)
p
##       fit      lwr      upr
## 1 4.22666 2.856942 5.596379
# Back-transform the fit and interval bounds from the log scale.
exp(x = p)
##        fit      lwr      upr
## 1 68.48813 17.40821 269.4489

Earlier on, we tried to identify the outliers. From the previous residual plots, we were able to see the outliers labelled as 25 & 27. Let’s plot the diagnostics again to see if we can identify the specific countries, and then refit the model.

# Two panels side by side. mfrow stays first so the later cex setting
# is not overridden by the layout change.
par(
  mfrow = c(1, 2),
  mar = c(4.5, 4.5, 2, 2),
  cex = 0.7
)
# Draw only the first two diagnostic plots of the fitted model.
plot(mod1, which = 1:2)

# Row positions whose standardized residual exceeds 3 in absolute value.
which(abs(rstandard(model = mod1)) > 3)
## 25 27 
## 25 27
# Look up the full records for the two flagged rows.
mort[c(25, 27), , drop = FALSE]
##         country income infmort region
## 25        Libya   3010     300 Africa
## 27 Saudi.Arabia   1530     650   Asia
# Drop the two flagged outlier rows (25 = Libya, 27 = Saudi.Arabia).
# The trailing comma is required for row indexing: `mort[-c(25, 25) ]`
# indexed columns instead, removed nothing, and never listed row 27.
# NOTE(review): the printed results that follow in this document came from
# the unfiltered data (99 residual df, i.e. all 101 rows) and must be
# regenerated after this fix.
mort2 <- mort[-c(25, 27), ]

# Refit the log-log model on the data without the outliers.
mod2 <- lm(log(infmort) ~ log(income), data = mort2)
summary(mod2)
## 
## Call:
## lm(formula = log(infmort) ~ log(income), data = mort2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.66694 -0.42779 -0.02649  0.30441  3.08415 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.14582    0.31654  22.575   <2e-16 ***
## log(income) -0.51179    0.05122  -9.992   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6867 on 99 degrees of freedom
## Multiple R-squared:  0.5021, Adjusted R-squared:  0.4971 
## F-statistic: 99.84 on 1 and 99 DF,  p-value: < 2.2e-16

With the outliers removed, let’s fit regression models that also include region, and compare them with a partial F-test.

# Additive model: log(income) plus a main effect for region.
mod3 <- lm(
  formula = log(infmort) ~ log(income) + region,
  data = mort2
)
summary(object = mod3)
## 
## Call:
## lm(formula = log(infmort) ~ log(income) + region, data = mort2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5108 -0.3491 -0.0458  0.2804  2.9928 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     6.40301    0.35829  17.871  < 2e-16 ***
## log(income)    -0.29939    0.06741  -4.441 2.39e-05 ***
## regionAmericas -0.60223    0.19023  -3.166  0.00207 ** 
## regionAsia     -0.72334    0.16324  -4.431 2.49e-05 ***
## regionEurope   -1.20282    0.25881  -4.647 1.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6122 on 96 degrees of freedom
## Multiple R-squared:  0.6163, Adjusted R-squared:  0.6003 
## F-statistic: 38.55 on 4 and 96 DF,  p-value: < 2.2e-16
# Interaction model: the log(income) slope is allowed to differ by region.
mod4 <- lm(
  formula = log(infmort) ~ log(income) * region,
  data = mort2
)
summary(object = mod4)
## 
## Call:
## lm(formula = log(infmort) ~ log(income) * region, data = mort2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.46809 -0.26530 -0.02148  0.27478  3.14219 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  4.9385     0.6362   7.763 1.06e-11 ***
## log(income)                 -0.0112     0.1235  -0.091   0.9280    
## regionAmericas               1.5661     1.1856   1.321   0.1898    
## regionAsia                   1.2634     0.8561   1.476   0.1434    
## regionEurope                 2.0882     1.8422   1.134   0.2599    
## log(income):regionAmericas  -0.3978     0.1979  -2.010   0.0473 *  
## log(income):regionAsia      -0.3798     0.1580  -2.404   0.0182 *  
## log(income):regionEurope    -0.5205     0.2516  -2.069   0.0413 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5971 on 93 degrees of freedom
## Multiple R-squared:  0.6464, Adjusted R-squared:  0.6198 
## F-statistic: 24.29 on 7 and 93 DF,  p-value: < 2.2e-16
# Partial F-test: does adding the income-by-region interaction improve on
# the additive model?
anova(object = mod3, mod4)
## Analysis of Variance Table
## 
## Model 1: log(infmort) ~ log(income) + region
## Model 2: log(infmort) ~ log(income) * region
##   Res.Df    RSS Df Sum of Sq      F  Pr(>F)  
## 1     96 35.980                              
## 2     93 33.152  3    2.8279 2.6444 0.05375 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Comments, corrections, and referrals are highly appreciated.