#Load the packages this script relies on:
#dplyr for %>% and filter(), ggplot2 for the static plots, plotly for the interactive plot.
library(dplyr)
library(ggplot2)
library(plotly)

#Folder containing the csv file (machine-specific; adjust when running elsewhere):
data_dir <- "C:/Users/12403/Desktop/MC 2020/MC Fall 2020/DATA110"

#Read the csv through its full path so the session's working directory is left untouched.
#stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 as well,
#which is what the per-level counts in the recorded summary below reflect.
des_dem <- read.csv(
  file.path(data_dir, "disease_democ.csv"),
  stringsAsFactors = TRUE
)

#Summarize data:
summary(des_dem)
##         country                   income_group  democ_score     infect_rate   
##  Afghanistan:  1   High income: non-OECD:16    Min.   :15.80   Min.   :23.00  
##  Albania    :  1   High income: OECD    :31    1st Qu.:28.40   1st Qu.:27.00  
##  Algeria    :  1   Low income           :40    Median :38.40   Median :32.00  
##  Angola     :  1   Lower middle income  :45    Mean   :42.78   Mean   :33.33  
##  Argentina  :  1   Upper middle income  :36    3rd Qu.:52.65   3rd Qu.:39.00  
##  Armenia    :  1                               Max.   :86.60   Max.   :48.00  
##  (Other)    :162
#Column names:
colnames(des_dem) <- c("Country", "Income Group", "Democracy Score", "Infection Rate")
#Divide into smaller objects per Income Group, so each income group can be analyzed separately
#to understand the relationship between infection rates and democracy scores.
#The dataset as a whole is examined afterwards; first we identify the groups that are
#"skewing" the data, or better said, that can better explain the data.
#filter() alone selects one group's rows; no group_by() is needed, which also avoids
#leaving each subset as a grouped tibble that is never ungrouped.
hio <- des_dem %>% filter(`Income Group` == "High income: OECD")
hino <- des_dem %>% filter(`Income Group` == "High income: non-OECD")
umi <- des_dem %>% filter(`Income Group` == "Upper middle income")
lmi <- des_dem %>% filter(`Income Group` == "Lower middle income")
li <- des_dem %>% filter(`Income Group` == "Low income")
#High Income: OECD Countries
#Create ggplot frame (Democracy Score on x, Infection Rate on y):
hiop1 <- ggplot(hio, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
hiop2 <- hiop1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD"
  )

#Create geom_smooth plot with confidence intervals.
#hiop2 already carries the point layer, so it is not added a second time here;
#the later ggtitle() call replaces the title/subtitle set on hiop2.
hiop3 <- hiop2 +
  theme_minimal() +
  geom_smooth(colour = "purple") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD (Smoother with CI)"
  )
#Display geom_smooth plot:
hiop3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on hiop2's point layer)
hiop4 <- hiop2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "purple") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "High Income: OECD (Linear Regression)"
  )

#Display geom_smooth plot with linear regression
hiop4

#Perform linear regression and display summary, as well as anova table:
hiolm <- lm(`Infection Rate` ~ `Democracy Score`, data = hio)

summary(hiolm)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = hio)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6506 -1.4014 -0.5068  1.0347  5.6305 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       31.28944    2.94071  10.640 1.59e-11 ***
## `Democracy Score` -0.06390    0.03929  -1.626    0.115    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.16 on 29 degrees of freedom
## Multiple R-squared:  0.08359,    Adjusted R-squared:  0.05199 
## F-statistic: 2.645 on 1 and 29 DF,  p-value: 0.1147
anova(hiolm)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                   Df  Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score`  1  12.345 12.3446  2.6453 0.1147
## Residuals         29 135.333  4.6666
#The results for countries in the High Income: OECD group show a small negative correlation between Infection Rates and Democracy Scores.
#The correlation is not statistically significant at the 0.05 level: the t-value (-1.626) lies between -2 and 2, and the p-value is quite large at 0.1147.
#High Income: non-OECD Countries
#Create ggplot frame (Democracy Score on x, Infection Rate on y):
hinop1 <- ggplot(hino, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
hinop2 <- hinop1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD"
  )

#Create geom_smooth plot with confidence intervals.
#hinop2 already carries the point layer, so it is not added a second time here;
#the later ggtitle() call replaces the title/subtitle set on hinop2.
hinop3 <- hinop2 +
  theme_minimal() +
  geom_smooth(colour = "black") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD (Smoother with CI)"
  )

#Display geom_smooth plot:
hinop3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on hinop2's point layer)
hinop4 <- hinop2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "black") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "High Income: non-OECD (Linear Regression)"
  )

#Display geom_smooth plot with linear regression:
hinop4

#Perform linear regression and display summary, as well as anova table:
hinolm <- lm(`Infection Rate` ~ `Democracy Score`, data = hino)

summary(hinolm)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = hino)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.8914 -2.7439 -0.0859  1.9043  7.1948 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       36.33270    3.91577   9.279 2.34e-07 ***
## `Democracy Score` -0.16319    0.07434  -2.195   0.0455 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.844 on 14 degrees of freedom
## Multiple R-squared:  0.256,  Adjusted R-squared:  0.2029 
## F-statistic: 4.818 on 1 and 14 DF,  p-value: 0.04552
anova(hinolm)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                   Df  Sum Sq Mean Sq F value  Pr(>F)  
## `Democracy Score`  1  71.182  71.182  4.8185 0.04552 *
## Residuals         14 206.818  14.773                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the High Income: non-OECD group show a negative and statistically significant correlation between Infection Rates and Democracy Scores.
#The correlation is significant because the t-value (-2.195) falls outside the range of -2 to 2, and the p-value (0.0455) is less than 0.05.
#However, when looking at the adjusted R-squared, we see that the Democracy Score explains only about 20% of the variance in Infection Rate.
#Upper Middle Income Countries
#Create ggplot frame (Democracy Score on x, Infection Rate on y):
umip1 <- ggplot(umi, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
umip2 <- umip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Upper middle income"
  )

#Create geom_smooth plot with confidence intervals.
#umip2 already carries the point layer, so it is not added a second time here;
#the later ggtitle() call replaces the title/subtitle set on umip2.
umip3 <- umip2 +
  theme_minimal() +
  geom_smooth(colour = "green") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Upper middle income (Smoother with CI)"
  )

#Display geom_smooth plot:
umip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on umip2's point layer)
umip4 <- umip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "green") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Upper middle income (Linear Regression)"
  )

#Display geom_smooth plot with linear regression:
umip4

#Perform linear regression and display summary, as well as anova table:
umilm <- lm(`Infection Rate` ~ `Democracy Score`, data = umi)

summary(umilm)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = umi)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.101 -5.144 -0.587  4.784 11.143 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        40.2477     6.4965   6.195 4.81e-07 ***
## `Democracy Score`  -0.1718     0.1493  -1.151    0.258    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.983 on 34 degrees of freedom
## Multiple R-squared:  0.03748,    Adjusted R-squared:  0.009172 
## F-statistic: 1.324 on 1 and 34 DF,  p-value: 0.2579
anova(umilm)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                   Df  Sum Sq Mean Sq F value Pr(>F)
## `Democracy Score`  1   47.39  47.388   1.324 0.2579
## Residuals         34 1216.92  35.792
#The results for countries in the Upper Middle Income group show a small negative correlation between Infection Rates and Democracy Scores.
#This correlation is also not statistically significant at the 0.05 level: the t-value (-1.151) lies between -2 and 2, and the p-value is very large at 0.2579.
#Lower middle income
#Create ggplot frame (Democracy Score on x, Infection Rate on y):
lmip1 <- ggplot(lmi, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
lmip2 <- lmip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Lower middle income"
  )

#Create geom_smooth plot with confidence intervals.
#lmip2 already carries the point layer, so it is not added a second time here;
#the later ggtitle() call replaces the title/subtitle set on lmip2.
lmip3 <- lmip2 +
  theme_minimal() +
  geom_smooth(colour = "red") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Lower middle income (Smoother with CI)"
  )

#Display geom_smooth plot:
lmip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on lmip2's point layer)
lmip4 <- lmip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "red") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Lower middle income (Linear Regression)"
  )

#Display geom_smooth plot with linear regression:
lmip4

#Perform linear regression and display summary, as well as anova table:
lmilm <- lm(`Infection Rate` ~ `Democracy Score`, data = lmi)

summary(lmilm)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = lmi)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.654 -4.011 -0.192  3.965 10.947 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        44.0225     3.5007  12.575 5.33e-16 ***
## `Democracy Score`  -0.2568     0.1000  -2.568   0.0138 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.311 on 43 degrees of freedom
## Multiple R-squared:  0.1329, Adjusted R-squared:  0.1128 
## F-statistic: 6.593 on 1 and 43 DF,  p-value: 0.0138
anova(lmilm)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                   Df  Sum Sq Mean Sq F value Pr(>F)  
## `Democracy Score`  1  185.96 185.958   6.593 0.0138 *
## Residuals         43 1212.84  28.206                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the Lower middle income group show a negative and statistically significant correlation between Infection Rates and Democracy Scores.
#This is significant because the t-value is outside the range of -2 to 2 at -2.568, and the p-value is less than 5% at 0.0138.
#The adjusted R-squared, however, is not promising at 0.1128.
#The values for countries in the lower middle income group show significance, but more data is required to support the theory of correlation.
#Low income
#Create ggplot frame (Democracy Score on x, Infection Rate on y):
lip1 <- ggplot(li, aes(x = `Democracy Score`, y = `Infection Rate`)) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
lip2 <- lip1 +
  geom_point() +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Low income"
  )

#Create geom_smooth plot with confidence intervals.
#lip2 already carries the point layer, so it is not added a second time here;
#the later ggtitle() call replaces the title/subtitle set on lip2.
lip3 <- lip2 +
  theme_minimal() +
  geom_smooth(colour = "orange") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "Low income (Smoother with CI)"
  )

#Display geom_smooth plot:
lip3
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on lip2's point layer)
lip4 <- lip2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "orange") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "Low income (Linear Regression)"
  )

#Display geom_smooth plot with linear regression:
lip4

#Perform linear regression and display summary, as well as anova table:
lilm <- lm(`Infection Rate` ~ `Democracy Score`, data = li)

summary(lilm)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = li)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.7987  -4.6756   0.8124   3.9454   8.1696 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        51.1998     4.4154  11.596 4.77e-14 ***
## `Democracy Score`  -0.4960     0.1756  -2.825  0.00749 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.259 on 38 degrees of freedom
## Multiple R-squared:  0.1736, Adjusted R-squared:  0.1518 
## F-statistic:  7.98 on 1 and 38 DF,  p-value: 0.007494
anova(lilm)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                   Df  Sum Sq Mean Sq F value   Pr(>F)   
## `Democracy Score`  1  220.74 220.744  7.9801 0.007494 **
## Residuals         38 1051.16  27.662                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for countries in the Low income group show a negative but strong correlation between Infection Rates and Democracy Scores; it is the strongest among the groups.
#The correlation is quite significant, with each additional point of democracy score associated with a 0.4960 decrease in infection rate.
#The t-value is very strong as well, at -2.825, and the p-value is minuscule, at 0.00749.
#Although the correlation is faint at higher levels of democracy scores, the correlation is very strong at lower levels of both democracy and income.
#Democracies and Disease
#Create ggplot frame for the whole dataset, colouring points by Income Group:
p1 <- ggplot(
  des_dem,
  aes(x = `Democracy Score`, y = `Infection Rate`, colour = `Income Group`)
) +
  xlab("Democracy Score") +
  ylab("Infection Rate")

#Add geom_point and titles
p2 <- p1 +
  geom_point() +
  ggtitle("Relationship between Infection Rate and Democracy Score")

#Create geom_smooth plot with confidence intervals.
#p2 already carries the point layer, so it is not added a second time here.
p3 <- p2 +
  theme_minimal() +
  geom_smooth(colour = "blue") +
  ggtitle(
    "Relationship between Infection Rate and Democracy Score",
    subtitle = "(Smoother with CI)"
  )

#Display geom_smooth plot as facet wrap.
#The facet variable is named with vars() so it is looked up in the plot's own data,
#rather than passing des_dem$`Income Group` as a detached vector.
p3 + facet_wrap(vars(`Income Group`))
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

#Create geom_smooth plot with linear regression (again building on p2's point layer)
p4 <- p2 +
  theme_minimal() +
  geom_smooth(method = "lm", formula = y ~ x, colour = "blue") +
  ggtitle(
    "Inverse correlation between Infection Rate and Democracy Score",
    subtitle = "(Linear Regression)"
  )

#Display geom_smooth plot with linear regression:
p4

#Create palette to match previous plots and use for interactive plot.
#The colours are keyed by income group name so that each group keeps the colour used in
#its own plot; an unnamed vector would be assigned by factor level order, which does not
#follow the order in which the groups were plotted.
pal <- c(
  "High income: OECD" = "purple",
  "High income: non-OECD" = "black",
  "Upper middle income" = "green",
  "Lower middle income" = "red",
  "Low income" = "orange"
)

#Build the interactive plot: one marker per country, coloured by income group via pal,
#with the country name supplied as the marker text.
p5 <- plot_ly(
  data = des_dem,
  x = ~`Democracy Score`,
  y = ~`Infection Rate`,
  color = ~`Income Group`,
  colors = pal,
  text = ~paste(`Country`)
)

#Attach the plot title and axis titles:
p6 <- layout(
  p5,
  title = "Relationship between Infection Rate and Democracy Score",
  xaxis = list(title = "Democracy Score"),
  yaxis = list(title = "Infection Rate")
)

#Show the interactive plot:
p6
## No trace type specified:
##   Based on info supplied, a 'scatter' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter
## No scatter mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#Fit Infection Rate against Democracy Score across every country in the dataset,
#then print the model summary followed by its anova table:
lm1 <- lm(
  formula = `Infection Rate` ~ `Democracy Score`,
  data = des_dem
)

summary(lm1)
## 
## Call:
## lm(formula = `Infection Rate` ~ `Democracy Score`, data = des_dem)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.6506 -3.7633  0.2188  3.6332 10.4621 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       43.59815    0.97374   44.77   <2e-16 ***
## `Democracy Score` -0.24008    0.02084  -11.52   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.071 on 166 degrees of freedom
## Multiple R-squared:  0.4442, Adjusted R-squared:  0.4409 
## F-statistic: 132.7 on 1 and 166 DF,  p-value: < 2.2e-16
anova(lm1)
## Analysis of Variance Table
## 
## Response: Infection Rate
##                    Df Sum Sq Mean Sq F value    Pr(>F)    
## `Democracy Score`   1   3412  3412.0  132.67 < 2.2e-16 ***
## Residuals         166   4269    25.7                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#The results for all countries, and the dataset as a whole, show a negative and very strong correlation between Infection Rates and Democracy Scores.
#The correlation is significant. The t-value is very strong, sitting way beyond the boundaries of -2 and 2 at -11.52. Supporting this at the 0.001 significance level is a p-value of less than 2.2e-16.
#Although the correlation is significant for the dataset as a whole, the relatively low adjusted R-squared of 0.4409 shows there are other factors at play, and other variables that may be skewing the data.