#Load the packages this script relies on:
#dplyr supplies %>% and filter(), ggplot2 the static plots, plotly the interactive plot.
library(dplyr)
library(ggplot2)
library(plotly)
#Folder containing the csv file (adjust this path to your machine):
data_dir <- "C:/Users/12403/Desktop/MC 2020/MC Fall 2020/DATA110"
#Read the csv into an object. The full path is built with file.path() so the
#session's working directory is left untouched:
des_dem <- read.csv(file.path(data_dir, "disease_democ.csv"))
#Summarize data:
summary(des_dem)
## country income_group democ_score infect_rate
## Afghanistan: 1 High income: non-OECD:16 Min. :15.80 Min. :23.00
## Albania : 1 High income: OECD :31 1st Qu.:28.40 1st Qu.:27.00
## Algeria : 1 Low income :40 Median :38.40 Median :32.00
## Angola : 1 Lower middle income :45 Mean :42.78 Mean :33.33
## Argentina : 1 Upper middle income :36 3rd Qu.:52.65 3rd Qu.:39.00
## Armenia : 1 Max. :86.60 Max. :48.00
## (Other) :162
#Column names:
colnames(des_dem) <- c("Country", "Income Group", "Democracy Score", "Infection Rate")
#Divide into smaller objects per Income Group, so that the relationship between
#infection rates and democracy scores can be analyzed for each income group separately.
#We will dive into the dataset as a whole afterwards; first we will identify the groups
#that are "skewing" the data, or better said, that can better explain the data.
#filter() alone selects the rows: no group_by() is needed, and leaving it out avoids
#returning subsets that stay grouped.
hio <- des_dem %>% filter(`Income Group` == "High income: OECD")
hino <- des_dem %>% filter(`Income Group` == "High income: non-OECD")
umi <- des_dem %>% filter(`Income Group` == "Upper middle income")
lmi <- des_dem %>% filter(`Income Group` == "Lower middle income")
li <- des_dem %>% filter(`Income Group` == "Low income")
#High Income: OECD Countries
#Create ggplot frame:
hiop1 <- ggplot(hio, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
hiop2 <- hiop1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD"
  )
#Create geom_smooth plot with confidence intervals.
#hiop2 already carries the point layer, so geom_point() is not added a second time:
hiop3 <- hiop2 +
  theme_minimal() +
  geom_smooth(colour = "purple") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD (Smoother with CI)"
  )
#Display geom_smooth plot:
hiop3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from hiop2):
hiop4 <- hiop2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "purple") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD (Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
hiop4

#Perform linear regression and display summary, as well as anova table:
hiolm <- lm(`Infection Rate` ~ `Democracy Score`, data = hio)
summary(hiolm)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = hio)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6506 -1.4014 -0.5068 1.0347 5.6305
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.28944 2.94071 10.640 1.59e-11 ***
## `Democracy Score` -0.06390 0.03929 -1.626 0.115
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.16 on 29 degrees of freedom
## Multiple R-squared: 0.08359, Adjusted R-squared: 0.05199
## F-statistic: 2.645 on 1 and 29 DF, p-value: 0.1147
anova(hiolm)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 12.345 12.3446 2.6453 0.1147
## Residuals 29 135.333 4.6666
#The results for countries in the High Income: OECD group show a small negative correlation between Infection Rates and Democracy Scores.
#The correlation is not statistically significant: the t-value (-1.626) lies between -2 and 2, and the p-value is quite large at 0.1147.
#High Income: non-OECD Countries
#Create ggplot frame:
hinop1 <- ggplot(hino, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
hinop2 <- hinop1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD"
  )
#Create geom_smooth plot with confidence intervals.
#hinop2 already carries the point layer, so geom_point() is not added a second time:
hinop3 <- hinop2 +
  theme_minimal() +
  geom_smooth(colour = "black") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD (Smoother with CI)"
  )
#Display geom_smooth plot:
hinop3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from hinop2):
hinop4 <- hinop2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD (Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
hinop4

#Perform linear regression and display summary, as well as anova table:
hinolm <- lm(`Infection Rate` ~ `Democracy Score`, data = hino)
summary(hinolm)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = hino)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.8914 -2.7439 -0.0859 1.9043 7.1948
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.33270 3.91577 9.279 2.34e-07 ***
## `Democracy Score` -0.16319 0.07434 -2.195 0.0455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.844 on 14 degrees of freedom
## Multiple R-squared: 0.256, Adjusted R-squared: 0.2029
## F-statistic: 4.818 on 1 and 14 DF, p-value: 0.04552
anova(hinolm)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 71.182 71.182 4.8185 0.04552 *
## Residuals 14 206.818 14.773
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the High Income: non-OECD group show a negative and statistically significant correlation between Infection Rates and Democracy Scores.
#The correlation is significant at the 5% level: the t-value (-2.195) lies just outside the range of -2 to 2, and the p-value (0.0455) is less than 0.05.
#However, the adjusted R-squared of 0.2029 shows that the Democracy Score explains only about 20% of the variation in Infection Rate.
#Upper Middle Income Countries
#Create ggplot frame:
umip1 <- ggplot(umi, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
umip2 <- umip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Upper middle income"
  )
#Create geom_smooth plot with confidence intervals.
#umip2 already carries the point layer, so geom_point() is not added a second time:
umip3 <- umip2 +
  theme_minimal() +
  geom_smooth(colour = "green") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Upper middle income (Smoother with CI)"
  )
#Display geom_smooth plot:
umip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from umip2):
umip4 <- umip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "green") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Upper middle income (Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
umip4

#Perform linear regression and display summary, as well as anova table:
umilm <- lm(`Infection Rate` ~ `Democracy Score`, data = umi)
summary(umilm)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = umi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.101 -5.144 -0.587 4.784 11.143
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 40.2477 6.4965 6.195 4.81e-07 ***
## `Democracy Score` -0.1718 0.1493 -1.151 0.258
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.983 on 34 degrees of freedom
## Multiple R-squared: 0.03748, Adjusted R-squared: 0.009172
## F-statistic: 1.324 on 1 and 34 DF, p-value: 0.2579
anova(umilm)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 47.39 47.388 1.324 0.2579
## Residuals 34 1216.92 35.792
#The results for countries in the Upper Middle Income group show a small negative correlation between Infection Rates and Democracy Scores.
#This correlation is also not statistically significant: the t-value (-1.151) lies between -2 and 2, and the p-value is very large at 0.2579.
#Lower middle income
#Create ggplot frame:
lmip1 <- ggplot(lmi, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
lmip2 <- lmip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Lower middle income"
  )
#Create geom_smooth plot with confidence intervals.
#lmip2 already carries the point layer, so geom_point() is not added a second time:
lmip3 <- lmip2 +
  theme_minimal() +
  geom_smooth(colour = "red") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Lower middle income (Smoother with CI)"
  )
#Display geom_smooth plot:
lmip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from lmip2):
lmip4 <- lmip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "red") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Lower middle income (Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
lmip4

#Perform linear regression and display summary, as well as anova table:
lmilm <- lm(`Infection Rate` ~ `Democracy Score`, data = lmi)
summary(lmilm)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = lmi)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.654 -4.011 -0.192 3.965 10.947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.0225 3.5007 12.575 5.33e-16 ***
## `Democracy Score` -0.2568 0.1000 -2.568 0.0138 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.311 on 43 degrees of freedom
## Multiple R-squared: 0.1329, Adjusted R-squared: 0.1128
## F-statistic: 6.593 on 1 and 43 DF, p-value: 0.0138
anova(lmilm)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 185.96 185.958 6.593 0.0138 *
## Residuals 43 1212.84 28.206
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the Lower middle income group show a negative and statistically significant correlation between Infection Rates and Democracy Scores.
#This is significant as the t-value is well outside of -2 and 2 at -2.568, and the p-value is less than 5% at 0.0138.
#The adjusted R-squared, however, is not promising at 0.1128.
#The values for countries in the lower middle income group show significance, but more data is required to explain the theory of correlation.
#Low income
#Create ggplot frame:
lip1 <- ggplot(li, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
lip2 <- lip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Low income"
  )
#Create geom_smooth plot with confidence intervals.
#lip2 already carries the point layer, so geom_point() is not added a second time:
lip3 <- lip2 +
  theme_minimal() +
  geom_smooth(colour = "orange") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Low income (Smoother with CI)"
  )
#Display geom_smooth plot:
lip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from lip2):
lip4 <- lip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "orange") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Low income (Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
lip4

#Perform linear regression and display summary, as well as anova table:
lilm <- lm(`Infection Rate` ~ `Democracy Score`, data = li)
summary(lilm)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = li)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.7987 -4.6756 0.8124 3.9454 8.1696
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.1998 4.4154 11.596 4.77e-14 ***
## `Democracy Score` -0.4960 0.1756 -2.825 0.00749 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.259 on 38 degrees of freedom
## Multiple R-squared: 0.1736, Adjusted R-squared: 0.1518
## F-statistic: 7.98 on 1 and 38 DF, p-value: 0.007494
anova(lilm)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 220.74 220.744 7.9801 0.007494 **
## Residuals 38 1051.16 27.662
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the Low income group show a negative and statistically significant correlation between Infection Rates and Democracy Scores; its slope is the steepest amongst the groups.
#The correlation is quite significant, with each additional point of democracy score associated with an infection rate that is lower by 0.4960.
#The t-value is very strong as well, at -2.825, and the p-value is minuscule, at 0.00749.
#Although the correlation is faint at higher levels of democracy scores, it is very strong at lower levels of both democracy and income.
#Democracies and Disease
#Create ggplot frame:
p1 <- ggplot(des_dem, aes(x = `Democracy Score`, y = `Infection Rate`, colour = `Income Group`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")
#Add geom_point and titles:
p2 <- p1 +
  geom_point() +
  ggtitle("Relationship between Infection Rate and Democracy Score")
#Create geom_smooth plot with confidence intervals.
#p2 already carries the point layer, so geom_point() is not added a second time:
p3 <- p2 +
  theme_minimal() +
  geom_smooth(colour = "blue") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "(Smoother with CI)"
  )
#Display geom_smooth plot as facet wrap.
#The facet is given as a formula naming the column, so the panels are computed from
#the plot's own data instead of from an external vector:
p3 + facet_wrap(~`Income Group`)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (points again come from p2):
p4 <- p2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "blue") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "(Linear Regression)"
  )
#Display geom_smooth plot with linear regression:
p4

#Create palette to match previous plots and use for interactive plot.
#The colours are named by income group so that each group keeps the colour of its own
#plot above; an unnamed vector would be assigned in factor-level order instead.
pal <- c(
  "High income: OECD" = "purple",
  "High income: non-OECD" = "black",
  "Upper middle income" = "green",
  "Lower middle income" = "red",
  "Low income" = "orange"
)
#Create interactive plot with details
p5 <- plot_ly(
  data = des_dem,
  x = ~`Democracy Score`,
  y = ~`Infection Rate`,
  color = ~`Income Group`,
  colors = pal,
  text = ~paste(`Country`)
)
p6 <- p5 %>% layout(
  title = "Relationship between Infection Rate and Democracy Score",
  xaxis = list(title = "Democracy Score"),
  yaxis = list(title = "Infection Rate")
)
#Display interactive plot:
p6
## No trace type specified:
## Based on info supplied, a 'scatter' trace seems appropriate.
## Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
## Setting the mode to markers
## Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#Perform linear regression and display summary, as well as anova table:
lm1 <- lm(`Infection Rate` ~ `Democracy Score`, data = des_dem)
summary(lm1)
##
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = des_dem)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6506 -3.7633 0.2188 3.6332 10.4621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.59815 0.97374 44.77 <2e-16 ***
## `Democracy Score` -0.24008 0.02084 -11.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.071 on 166 degrees of freedom
## Multiple R-squared: 0.4442, Adjusted R-squared: 0.4409
## F-statistic: 132.7 on 1 and 166 DF, p-value: < 2.2e-16
anova(lm1)
## Analysis of Variance Table
##
## Response: Infection Rate
## Df Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score` 1 3412 3412.0 132.67 < 2.2e-16 ***
## Residuals 166 4269 25.7
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for all countries, and the dataset as a whole, show a negative and very strong correlation between Infection Rates and Democracy Scores.
#The correlation is significant. The t-value is very strong, sitting way beyond the boundaries of -2 and 2 at -11.52. Supporting this at the 0.001 significance level is a p-value less than 2.2e-16.
#Although the correlation is significant for the dataset as a whole, the adjusted R-squared of 0.4409 shows there are other factors at play, and other variables that may be skewing the data.