Problem set

There is a data set ‘countries.csv’ that contains information on 60 countries of the world and their characteristics:

Data set “Countries”
variable	label	possible_range
country	country name	NA
GDP.ppp.pc.us	GDP PPP per capita in US dollars	0+
Int.users.per.100	Internet users per 100	0-100
Status	Freedom House status (free, partly free, or unfree)	Free, Partly free, Unfree
democracy	binary Przeworski index of democracy (democracy or dictatorship)	Yes, No
P4	Polity IV index (ordinal scale from -10, autocracy, to +10, full democracy)	[-10; 10]

# Packages used for variable labels and for the codebook / model plots.
library(sjlabelled)
library(sjPlot)

# Read the raw data. stringsAsFactors = TRUE is spelled out because R >= 4.0
# no longer converts strings to factors by default, and the summaries shown
# in this handout (factor levels for country, Status and democracy) rely on
# that conversion. TRUE/FALSE are used instead of the reassignable T/F.
df <- read.csv("countries.csv", header = TRUE, stringsAsFactors = TRUE)

# Polity IV is an ordinal index, so it is stored as a factor.
df$P4 <- as.factor(df$P4)

# Drop columns 1, 8 and 9 of the raw file; the six remaining columns are the
# variables labelled below.
df <- df[, -c(1, 8, 9)]

# Human-readable variable labels, given in column order.
labs <- c(
  "Country name",
  "GDP per capita, PPP",
  "Internet per 100",
  "Freedom House Index",
  "Democracy-Dictatorship",
  "Polity IV"
)
df <- set_label(df, label = labs)

# Display the labelled codebook of the data frame.
view_df(df, verbose = FALSE)

Data frame: df
ID	Name	Label	Values	Value Labels
1	country	Country name		Algeria Argentina Armenia Australia Azerbaijan Bahrain Belarus Brazil Chile China Colombia Cyprus Ecuador Egypt Estonia <… truncated>
2	GDP.ppp.pc.us	GDP per capita, PPP	range: 1535.5-122609.4
3	Int.users.per.100	Internet per 100	range: 7.1-92.9
4	Status	Freedom House Index		Free Partly free Unfree
5	democracy	Democracy-Dictatorship		No Yes
6	P4	Polity IV		-10 -9 -7 -6 -4 -3 -2 2 3 4 5 6 7 8 9 <… truncated>

# Descriptive statistics for every column (psych package).
# In the table below, variables marked with * are factors summarised through
# their integer level codes (e.g. P4* spans 1-16, not -10..10), so their
# means and SDs are not substantively meaningful.
library(psych)
describe(df)

##                   vars  n     mean       sd   median  trimmed      mad
## country*             1 60    30.50    17.46    30.50    30.50    22.24
## GDP.ppp.pc.us        2 57 23234.14 21823.75 16457.09 19795.13 13988.32
## Int.users.per.100    3 58    49.26    23.42    48.70    49.17    28.32
## Status*              4 60     1.83     0.81     2.00     1.79     1.48
## democracy*           5 58     1.60     0.49     2.00     1.62     0.00
## P4*                  6 56    11.25     4.86    13.00    11.74     4.45
##                       min       max     range  skew kurtosis      se
## country*             1.00     60.00     59.00  0.00    -1.26    2.25
## GDP.ppp.pc.us     1535.48 122609.41 121073.93  2.18     6.18 2890.63
## Int.users.per.100    7.10     92.86     85.76  0.03    -0.96    3.08
## Status*              1.00      3.00      2.00  0.30    -1.43    0.10
## democracy*           1.00      2.00      1.00 -0.41    -1.86    0.06
## P4*                  1.00     16.00     15.00 -0.74    -0.89    0.65

0. Scatterplot and correlation

# Scatterplot: Internet users per 100 (y-axis) against GDP PPP per capita
# (x-axis).
plot(Int.users.per.100 ~ GDP.ppp.pc.us, data = df)

# Pearson correlation test between Internet use and GDP per capita; only
# countries with both values present enter the test.
cor.test(df$Int.users.per.100, df$GDP.ppp.pc.us)

## 
##  Pearson's product-moment correlation
## 
## data:  df$Int.users.per.100 and df$GDP.ppp.pc.us
## t = 6.8129, df = 55, p-value = 7.637e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5049674 0.7966539
## sample estimates:
##       cor 
## 0.6765198

1. Regress Internet users on GDP PPP (natural scales)

# Model 1: simple linear regression of Internet users per 100 on GDP PPP per
# capita, both in their original units. The slope is the expected difference
# in users per 100 for a one-dollar difference in GDP per capita.
regr <- lm(Int.users.per.100 ~ GDP.ppp.pc.us, data = df)
summary(regr)

## 
## Call:
## lm(formula = Int.users.per.100 ~ GDP.ppp.pc.us, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -52.410 -10.327   0.806  12.070  28.745 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   3.307e+01  3.357e+00   9.850 9.57e-14 ***
## GDP.ppp.pc.us 7.205e-04  1.058e-04   6.813 7.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 17.27 on 55 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.4577, Adjusted R-squared:  0.4478 
## F-statistic: 46.42 on 1 and 55 DF,  p-value: 7.637e-09

# ANOVA table for model 1: splits the variation in Internet use into the part
# explained by GDP per capita and the residual part.
anova(regr)

## Analysis of Variance Table
## 
## Response: Int.users.per.100
##               Df Sum Sq Mean Sq F value    Pr(>F)    
## GDP.ppp.pc.us  1  13846 13846.1  46.416 7.637e-09 ***
## Residuals     55  16407   298.3                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Standard lm diagnostic plots for model 1 (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage).
plot(regr)

# sjPlot coefficient plot for model 1 (estimate with its confidence interval).
plot_model(regr)

# Look up the country in row 40 (column 1 holds the country name).
# NOTE(review): presumably an observation flagged in the diagnostic plots
# of model 1 - confirm.
df[40,1]

## [1] Qatar
## 60 Levels: Algeria Argentina Armenia Australia Azerbaijan ... Zimbabwe

# Look up the country in row 47 (column 1 holds the country name).
# NOTE(review): presumably an observation flagged in the diagnostic plots
# of model 1 - confirm.
df[47,1]

## [1] South Korea
## 60 Levels: Algeria Argentina Armenia Australia Azerbaijan ... Zimbabwe

# Look up the country in row 21 (column 1 holds the country name).
# NOTE(review): presumably an observation flagged in the diagnostic plots
# of model 1 - confirm.
df[21,1]

## [1] Iraq
## 60 Levels: Algeria Argentina Armenia Australia Azerbaijan ... Zimbabwe

2. Regress Internet users on log GDP PPP

# Model 2: same outcome, but GDP per capita enters on the natural-log scale.
# The slope is the expected difference in users per 100 for a one-unit
# difference in log GDP (roughly slope / 100 per 1% difference in GDP).
regr2 <- lm(Int.users.per.100 ~ log(GDP.ppp.pc.us), data = df)
summary(regr2)

## 
## Call:
## lm(formula = Int.users.per.100 ~ log(GDP.ppp.pc.us), data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.436  -7.716   0.998   8.193  22.431 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -151.951     18.049  -8.419 1.82e-11 ***
## log(GDP.ppp.pc.us)   20.865      1.858  11.229 7.46e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12.92 on 55 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.6963, Adjusted R-squared:  0.6908 
## F-statistic: 126.1 on 1 and 55 DF,  p-value: 7.458e-16

# ANOVA table for model 2: variation explained by log GDP per capita versus
# residual variation.
anova(regr2)

## Analysis of Variance Table
## 
## Response: Int.users.per.100
##                    Df Sum Sq Mean Sq F value    Pr(>F)    
## log(GDP.ppp.pc.us)  1  21065 21064.9   126.1 7.458e-16 ***
## Residuals          55   9188   167.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Standard lm diagnostic plots for model 2 (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage).
plot(regr2)

# sjPlot coefficient plot for model 2 (estimate with its confidence interval).
plot_model(regr2)

3. Regress Internet users on GDP PPP (standardized scales)

# Model 3: both variables are converted to z-scores with scale(), so the
# slope is expressed in standard deviations.
# NOTE(review): scale() standardises each variable over all of its
# non-missing values, while lm() then drops rows with any missing value. The
# estimation sample is therefore not exactly mean 0 / SD 1, which is why the
# intercept below is not exactly 0 and the slope (0.671) differs slightly
# from Pearson's r (0.677). Standardising on the complete cases first would
# make the slope equal r.
regr3 <- lm(scale(Int.users.per.100) ~ scale(GDP.ppp.pc.us), data = df)
summary(regr3)

## 
## Call:
## lm(formula = scale(Int.users.per.100) ~ scale(GDP.ppp.pc.us), 
##     data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.23789 -0.44097  0.03443  0.51539  1.22740 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           0.02359    0.09768   0.242     0.81    
## scale(GDP.ppp.pc.us)  0.67142    0.09855   6.813 7.64e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7375 on 55 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.4577, Adjusted R-squared:  0.4478 
## F-statistic: 46.42 on 1 and 55 DF,  p-value: 7.637e-09

# ANOVA table for model 3 (standardised variables): explained versus
# residual variation.
anova(regr3)

## Analysis of Variance Table
## 
## Response: scale(Int.users.per.100)
##                      Df Sum Sq Mean Sq F value    Pr(>F)    
## scale(GDP.ppp.pc.us)  1 25.245 25.2454  46.416 7.637e-09 ***
## Residuals            55 29.914  0.5439                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Standard lm diagnostic plots for model 3 (residuals vs fitted, normal Q-Q,
# scale-location, residuals vs leverage).
plot(regr3)

# sjPlot coefficient plot for model 3 (estimate with its confidence interval).
plot_model(regr3)

Your First Linear Regression

Olesya Volchenko & Anna Shirokanova

April 4, 2019

Problem set

0. Scatterplot and correlation

1. Regress Internet users on GDP PPP (natural scales)

2. Regress Internet users on log GDP PPP

3. Regress Internet users on GDP PPP (standardized scales)