Install Packages
#install.packages("dplyr")
#install.packages("ggplot2")
#install.packages("tidyverse")
#install.packages("plotly")
Load libraries
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.5.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.3
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.0.1 v purrr 0.3.0
## v tidyr 0.8.2 v stringr 1.3.1
## v readr 1.3.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(plotly)
## Warning: package 'plotly' was built under R version 3.5.3
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
Read files
# Read the disease/democracy data set from CSV and preview its first six rows.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is where this exact file location exists.
disease_democ <- read.csv(
  file = "c:/Users/dwilliams/documents/montgomery/data 110/data/disease_democ.csv"
)
head(disease_democ, n = 6L)
## country income_group democ_score infect_rate
## 1 Bahrain High income: non-OECD 45.6 23
## 2 Bahamas, The High income: non-OECD 48.4 24
## 3 Qatar High income: non-OECD 50.4 24
## 4 Latvia High income: non-OECD 52.8 25
## 5 Barbados High income: non-OECD 46.0 26
## 6 Singapore High income: non-OECD 64.0 26
Filter into income groups
# Build one data frame per value of income_group by keeping only the rows
# whose income_group matches that label exactly.
low_income <- disease_democ %>%
  filter(income_group == "Low income")
lower_middle_income <- disease_democ %>%
  filter(income_group == "Lower middle income")
upper_middle_income <- disease_democ %>%
  filter(income_group == "Upper middle income")
high_nonoecd_income <- disease_democ %>%
  filter(income_group == "High income: non-OECD")
high_oecd_income <- disease_democ %>%
  filter(income_group == "High income: OECD")
Rename columns
# Give the two numeric columns readable names in the full data set and in
# every income-group subset: democ_score -> Democracy, infect_rate -> Infections.
disease_democ <- disease_democ %>%
  rename(Democracy = democ_score, Infections = infect_rate)
low_income <- low_income %>%
  rename(Democracy = democ_score, Infections = infect_rate)
lower_middle_income <- lower_middle_income %>%
  rename(Democracy = democ_score, Infections = infect_rate)
upper_middle_income <- upper_middle_income %>%
  rename(Democracy = democ_score, Infections = infect_rate)
high_nonoecd_income <- high_nonoecd_income %>%
  rename(Democracy = democ_score, Infections = infect_rate)
high_oecd_income <- high_oecd_income %>%
  rename(Democracy = democ_score, Infections = infect_rate)
View groups
# Open each data frame in the data viewer for manual inspection.
# View() relies on a GUI viewer, so the calls are only made in an interactive
# session; batch runs (Rscript, knitting) skip them.
if (interactive()) {
  View(disease_democ)
  View(low_income)
  View(lower_middle_income)
  View(upper_middle_income)
  View(high_oecd_income)
  View(high_nonoecd_income)
}
Get summary statistics and standard deviation
# Low Income Summary
# Per-column summary of the low-income subset.
low_income %>% summary()
## country income_group Democracy
## Afghanistan : 1 High income: non-OECD: 0 Min. :15.80
## Bangladesh : 1 High income: OECD : 0 1st Qu.:21.55
## Benin : 1 Low income :40 Median :24.60
## Burkina Faso: 1 Lower middle income : 0 Mean :24.70
## Burundi : 1 Upper middle income : 0 3rd Qu.:28.10
## Cambodia : 1 Max. :33.00
## (Other) :34
## Infections
## Min. :27.00
## 1st Qu.:35.50
## Median :40.00
## Mean :38.95
## 3rd Qu.:43.00
## Max. :48.00
##
# Standard deviation of the Infections column in the low-income subset.
sd(low_income[["Infections"]])
## [1] 5.710764
# Standard deviation of the Democracy column in the low-income subset.
sd(low_income[["Democracy"]])
## [1] 4.79615
# Lower Middle Income Summary
# Per-column summary of the lower-middle-income subset.
lower_middle_income %>% summary()
## country income_group Democracy
## Angola : 1 High income: non-OECD: 0 Min. :22.2
## Armenia : 1 High income: OECD : 0 1st Qu.:28.4
## Belize : 1 Low income : 0 Median :32.4
## Bhutan : 1 Lower middle income :45 Mean :34.1
## Bolivia : 1 Upper middle income : 0 3rd Qu.:39.0
## Côte d'Ivoire: 1 Max. :54.4
## (Other) :39
## Infections
## Min. :26.00
## 1st Qu.:31.00
## Median :35.00
## Mean :35.27
## 3rd Qu.:38.00
## Max. :47.00
##
# Standard deviation of the Infections column in the lower-middle-income subset.
sd(lower_middle_income[["Infections"]])
## [1] 5.638343
# Standard deviation of the Democracy column in the lower-middle-income subset.
sd(lower_middle_income[["Democracy"]])
## [1] 8.005864
# Upper Middle Income Summary
# Per-column summary of the upper-middle-income subset.
upper_middle_income %>% summary()
## country income_group Democracy
## Albania : 1 High income: non-OECD: 0 Min. :31.20
## Algeria : 1 High income: OECD : 0 1st Qu.:37.40
## Argentina : 1 Low income : 0 Median :41.70
## Azerbaijan : 1 Lower middle income : 0 Mean :42.99
## Belarus : 1 Upper middle income :36 3rd Qu.:47.55
## Bosnia and Herzegovina: 1 Max. :57.60
## (Other) :30
## Infections
## Min. :24.00
## 1st Qu.:27.00
## Median :32.50
## Mean :32.86
## 3rd Qu.:37.25
## Max. :45.00
##
# Standard deviation of the Infections column in the upper-middle-income subset.
sd(upper_middle_income[["Infections"]])
## [1] 6.010243
# Standard deviation of the Democracy column in the upper-middle-income subset.
sd(upper_middle_income[["Democracy"]])
## [1] 6.7728
# High Income OECD Income Summary
# Per-column summary of the high-income OECD subset.
high_oecd_income %>% summary()
## country income_group Democracy
## Australia : 1 High income: non-OECD: 0 Min. :54.0
## Austria : 1 High income: OECD :31 1st Qu.:69.8
## Belgium : 1 Low income : 0 Median :77.8
## Canada : 1 Lower middle income : 0 Mean :74.2
## Czech Republic: 1 Upper middle income : 0 3rd Qu.:81.6
## Denmark : 1 Max. :86.6
## (Other) :25
## Infections
## Min. :23.00
## 1st Qu.:25.00
## Median :26.00
## Mean :26.55
## 3rd Qu.:28.00
## Max. :32.00
##
# Standard deviation of the Infections column in the high-income OECD subset.
sd(high_oecd_income[["Infections"]])
## [1] 2.218689
# Standard deviation of the Democracy column in the high-income OECD subset.
sd(high_oecd_income[["Democracy"]])
## [1] 10.03939
# High Income non-OECD Income Summary
# Per-column summary of the high-income non-OECD subset.
high_nonoecd_income %>% summary()
## country income_group Democracy
## Bahamas, The : 1 High income: non-OECD:16 Min. :28.40
## Bahrain : 1 High income: OECD : 0 1st Qu.:44.35
## Barbados : 1 Low income : 0 Median :49.00
## Croatia : 1 Lower middle income : 0 Mean :51.06
## Cyprus : 1 Upper middle income : 0 3rd Qu.:59.20
## Equatorial Guinea: 1 Max. :77.60
## (Other) :10
## Infections
## Min. :23.00
## 1st Qu.:25.75
## Median :26.50
## Mean :28.00
## 3rd Qu.:28.25
## Max. :37.00
##
# Standard deviation of the Infections column in the high-income non-OECD subset.
sd(high_nonoecd_income[["Infections"]])
## [1] 4.305036
# Standard deviation of the Democracy column in the high-income non-OECD subset.
sd(high_nonoecd_income[["Democracy"]])
## [1] 13.3492
Create Histograms
# Histogram of all income levels
# Distribution of Infections across every row of disease_democ, with bars
# filled and outlined by income_group. The title is set only in labs();
# a separate ggtitle() would just be overridden by labs(title = ...).
ggplot(disease_democ, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    title = "All Incomes vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Low Income
# Distribution of Infections within the low-income subset. The title is set
# only in labs(); a separate ggtitle() would just be overridden by it.
ggplot(low_income, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    title = "Low Income vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  ) +
  # "Set5" is not a ColorBrewer palette name (it triggered an "Unknown
  # palette" warning); "Dark2" is a valid qualitative palette.
  scale_fill_brewer(palette = "Dark2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Lower Middle Income
# Distribution of Infections within the lower-middle-income subset. The title
# is set only in labs(); a separate ggtitle() would just be overridden by it.
ggplot(lower_middle_income, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    # Title matches the group name ("Lower middle income").
    title = "Lower Middle Income vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  ) +
  scale_fill_brewer(palette = "Set3")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of Upper Middle Income
# Distribution of Infections within the upper-middle-income subset. The title
# is set only in labs(); a separate ggtitle() would just be overridden by it.
ggplot(upper_middle_income, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    title = "Upper Middle Income vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  ) +
  # "Set4" is not a ColorBrewer palette name (it triggered an "Unknown
  # palette" warning); "Accent" is a valid qualitative palette.
  scale_fill_brewer(palette = "Accent")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of High Income: non-OECD Income
# Distribution of Infections within the high-income non-OECD subset. The title
# is set only in labs(); a separate ggtitle() would just be overridden by it.
ggplot(high_nonoecd_income, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    title = "High non-OECD Income vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  ) +
  scale_fill_brewer(palette = "Set1")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram of High Income: OECD Income
# Distribution of Infections within the high-income OECD subset. The title is
# set only in labs(); a separate ggtitle() would just be overridden by it.
ggplot(high_oecd_income, aes(x = Infections)) +
  geom_histogram(aes(fill = income_group, color = income_group)) +
  theme_light() +
  labs(
    title = "High OECD Income vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Number"
  ) +
  scale_fill_brewer(palette = "Set2")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Create scatter plots
# Scatter plot of Democracy vs Infections rate
# One point per row of disease_democ, colored by income_group. Every component
# is chained with `+`; without the `+` before labs(), the labels are evaluated
# as a stand-alone expression (and printed) instead of being added to the plot.
p <- ggplot(data = disease_democ, aes(x = Infections, y = Democracy)) +
  geom_point(aes(color = income_group)) +
  theme_light() +
  labs(
    title = "Democracy vs Infection Rate",
    subtitle = "Income",
    caption = "Infections",
    x = "Infections",
    y = "Democracy" # the y aesthetic is Democracy, not a count
  )
# Convert the static ggplot into an interactive plotly object and display it.
p <- ggplotly(p)
p
# Scatter plot of income vs Infections rate
# Infections plotted against income_group, one point per row, colored by
# income_group. Every component is chained with `+`; without the `+` before
# labs(), the labels and theme tweaks are evaluated on their own and never
# reach the plot.
q <- ggplot(data = disease_democ, mapping = aes(x = income_group, y = Infections)) +
  geom_point(aes(color = income_group)) +
  theme_light() +
  labs(
    title = "Income",
    subtitle = "Income",
    caption = "Income",
    x = "Income",
    y = "Infections"
  ) +
  # Hide the x-axis title, tick labels and tick marks; the color legend
  # already identifies each income group.
  theme(
    axis.title.x = element_blank(),
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
# Convert the static ggplot into an interactive plotly object and display it.
q <- ggplotly(q)
q
Create the linear regression model
# Fit a simple linear regression of Infections on Democracy using every row
# of disease_democ.
fit1 <- lm(formula = Infections ~ Democracy, data = disease_democ)
# Correlation between the two columns, using cor()'s default method.
with(disease_democ, cor(Infections, Democracy))
## [1] -0.6664911
# Print the residual summary, coefficient table and fit statistics.
summary(fit1)
##
## Call:
## lm(formula = Infections ~ Democracy, data = disease_democ)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.6506 -3.7633 0.2188 3.6332 10.4621
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 43.59815 0.97374 44.77 <2e-16 ***
## Democracy -0.24008 0.02084 -11.52 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.071 on 166 degrees of freedom
## Multiple R-squared: 0.4442, Adjusted R-squared: 0.4409
## F-statistic: 132.7 on 1 and 166 DF, p-value: < 2.2e-16