library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(readr)
library(ggplot2)
# Load the dataset ----
# Switch to the author's project folder only when it exists on this machine.
# Elsewhere the script keeps the current working directory, so the relative
# path "disease_democ.csv" is resolved from wherever the script is run instead
# of aborting on a missing, user-specific absolute path.
data_dir <- "C:/Users/tycho/Desktop/DATA110"
if (dir.exists(data_dir)) {
  setwd(data_dir)
}
# read_csv() guesses the column types (reported below) and returns a tibble.
disease_democ <- read_csv("disease_democ.csv")
## 
## -- Column specification --------------------------------------------------------
## cols(
##   country = col_character(),
##   income_group = col_character(),
##   democ_score = col_double(),
##   infect_rate = col_double()
## )
head(disease_democ)
## # A tibble: 6 x 4
##   country      income_group          democ_score infect_rate
##   <chr>        <chr>                       <dbl>       <dbl>
## 1 Bahrain      High income: non-OECD        45.6          23
## 2 Bahamas, The High income: non-OECD        48.4          24
## 3 Qatar        High income: non-OECD        50.4          24
## 4 Latvia       High income: non-OECD        52.8          25
## 5 Barbados     High income: non-OECD        46            26
## 6 Singapore    High income: non-OECD        64            26
# Inspect the dataset ----
# `disease_democ` was read with read_csv() above and is therefore already a
# tibble; reuse it rather than reading the same file from disk a second time.
tibble_democ <- disease_democ
# Structure: dimensions, column names and column types
str(tibble_democ)
## spec_tbl_df [168 x 4] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ country     : chr [1:168] "Bahrain" "Bahamas, The" "Qatar" "Latvia" ...
##  $ income_group: chr [1:168] "High income: non-OECD" "High income: non-OECD" "High income: non-OECD" "High income: non-OECD" ...
##  $ democ_score : num [1:168] 45.6 48.4 50.4 52.8 46 64 65.8 70.6 57.6 40.6 ...
##  $ infect_rate : num [1:168] 23 24 24 25 26 26 26 26 27 28 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   country = col_character(),
##   ..   income_group = col_character(),
##   ..   democ_score = col_double(),
##   ..   infect_rate = col_double()
##   .. )
# Per-column summary statistics (min, quartiles, mean, max for numeric columns)
summary(tibble_democ)
##    country          income_group        democ_score     infect_rate   
##  Length:168         Length:168         Min.   :15.80   Min.   :23.00  
##  Class :character   Class :character   1st Qu.:28.40   1st Qu.:27.00  
##  Mode  :character   Mode  :character   Median :38.40   Median :32.00  
##                                        Mean   :42.78   Mean   :33.33  
##                                        3rd Qu.:52.65   3rd Qu.:39.00  
##                                        Max.   :86.60   Max.   :48.00
# Horizontal boxplot of the democracy score distribution
# (the score is mapped to the x-axis, so the box lies on its side).
ggplot(data = tibble_democ, mapping = aes(x = democ_score)) +
  geom_boxplot() +
  labs(
    x = "Democracy Score",
    title = "Democracy Score Distribution"
  )

# Horizontal boxplot of the infection rate distribution
# (the rate is mapped to the x-axis, so the box lies on its side).
ggplot(data = tibble_democ, mapping = aes(x = infect_rate)) +
  geom_boxplot() +
  labs(
    x = "Infection Rate",
    title = "Infection Rate Distribution"
  )

# Scatterplot with trend line ----
# Reuses `disease_democ` as loaded at the top of the script; nothing has
# modified it since, so reading the CSV again is unnecessary.
#
# Aesthetics set for the whole plot:
#   x, y   infection rate against democracy score
#   text   per-country hover label ("<br>" is a line break in the tooltip);
#          it is picked up later by ggplotly(tooltip = "text")
#   group  fixed to 1 so geom_smooth() fits a single line through all
#          countries -- the discrete per-row `text` mapping would otherwise
#          take part in ggplot2's default grouping
disease_democ_scatter <- ggplot(
  disease_democ,
  aes(
    x = infect_rate,
    y = democ_score,
    text = paste(
      country, "<br>",
      "Infection rate: ", infect_rate, "<br>",
      "Democracy score: ", democ_score
    ),
    group = 1
  )
) +
  ggtitle("Country Infection Prevalence vs Democratization") +
  xlab("Infectious disease rate") +
  ylab("Democracy score") +
  # Points are coloured by income group; partial transparency keeps
  # overlapping countries visible.
  geom_point(aes(color = income_group), size = 2.5, alpha = 0.5) +
  # Straight-line (linear model) trend without a confidence band.
  geom_smooth(method = "lm", se = FALSE) +
  # `breaks` fixes the legend order from highest to lowest income group.
  scale_color_brewer(
    palette = "Set1",
    name = "Income group",
    breaks = c(
      "High income: OECD",
      "High income: non-OECD",
      "Upper middle income",
      "Lower middle income",
      "Low income"
    )
  )

# Making the chart interactive ----
# Convert the ggplot into a plotly widget, restrict the hover tooltip to the
# custom `text` aesthetic, and hide plotly's mode bar.
disease_democ_interactive <- disease_democ_scatter %>%
  ggplotly(tooltip = "text") %>%
  config(displayModeBar = FALSE)
## `geom_smooth()` using formula 'y ~ x'
# Plot the chart: the object is already a plotly widget, so printing it is
# enough -- passing it through ggplotly() a second time is redundant.
disease_democ_interactive

This dataset illustrates a controversial theory, suggested by evolutionary biologist Randy Thornhill, that the emergence of democratic political systems has depended largely on nations having low rates of infectious disease. The data come from the Global Infectious Diseases and Epidemiology Network (GIDEON) [1] and Democratization: A Comparative Analysis of 170 Countries [2]. In this data, each country is given a Democracy Score and an Infection Rate (democ_score and infect_rate, respectively). Democracy Scores range from 0 to 100, with higher numbers indicating a higher level of democratization. Likewise, higher Infection Rates indicate a higher rate of infection in the country. The countries are classified into five income groups (income_group): low income, lower middle income, upper middle income, high income: non-OECD, and high income: OECD. OECD refers to the Organisation for Economic Co-operation and Development, to which many first-world countries belong, including the US, Japan, Australia, and much of Europe. In my analysis, I first loaded the data as a tibble and used str() and summary() to obtain some basic statistics. The democracy scores ranged between 15.80 and 86.60, while the infection rates ranged between 23 and 48. Then I decided to look at each of the variables separately, making two boxplots to see how the data were spread over their overall range. The two boxplots looked almost identical, with the median sitting in the lower part of the range and a long whisker extending toward the higher values. I did not need to clean the data much, since the dataset did not contain too many variables or data points and did not have any NAs. For my main visualization, I chose a scatterplot, since the two variables I was using pair well together and a scatterplot is a great way to see whether a trend exists. Looking at the plot, a trend could indeed be seen: countries with higher democracy scores had lower rates of infection, while countries with lower democracy scores had higher rates of infection.
Something notable in the plot was that democracy scores varied more among countries with low infection rates than among those with high ones. This is especially true for the High income: OECD countries, all of which are either on or above the trend line. I also noticed that, in general, high income OECD countries had the highest democracy scores and the lowest infection rates, while low income countries had the lowest democracy scores and the highest infection rates. While this information is unsurprising, it does not necessarily support the claim that the emergence of democracies depends on nations having low rates of infectious disease. More information is needed to support that claim, and there are many exogenous factors that could affect both of the variables I analyzed. For example, a country could have a low rate of disease but lack the resources to build up its infrastructure, or it could be experiencing civil unrest or war. In future studies, I would like to see other variables, such as resource deposits, economic data, or technological advancements, included to get a better picture of what causes countries to democratize.