library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.1 v purrr 0.3.4
## v tibble 3.0.1 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(ggsci)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(DT)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(cowplot)
##
## ********************************************************
## Note: As of version 1.0.0, cowplot does not change the
## default ggplot2 theme anymore. To recover the previous
## behavior, execute:
## theme_set(theme_cowplot())
## ********************************************************
# Directory that holds the CSV files. Building paths with file.path() avoids
# changing the session's working directory with setwd(); edit this one value
# to run the script on another machine.
data_dir <- "C:/Users/Valued Customer/Desktop/Lovebug/Montgomery College/DATA 110/Week 3"
sw_le <- read.csv(file.path(data_dir, "SafeWaterLifeExpectancy.csv"))
# Changed column name in dataset
sw_le2 <- read.csv(file.path(data_dir, "SafeWaterLifeExpectancy_colrename.csv"))
# Added column for continent
sw_le3 <- read.csv(
  file.path(data_dir, "SafeWaterLifeExpectancy_colrename_continent.csv")
)
# sw_le4 was read from the same file as sw_le3; copy it instead of parsing the
# CSV a second time.
sw_le4 <- sw_le3
# Preview the first rows of the working data frame.
head(sw_le4)
## Country.Name Perc..Pop..Safe.Water Average.Life.Expectancy..years. Continent
## 1 Uganda 6.43900 59.50876 Africa
## 2 Ethiopia 10.53542 65.00829 Africa
## 3 Nigeria 19.40221 52.97793 Africa
## 4 Cambodia 24.09879 68.47205 Asia
## 5 Nepal 26.75171 69.86985 Asia
## 6 Ghana 26.86492 62.40724 Africa
# Number of rows and columns in the data frame.
dim(sw_le4)
## [1] 81 4
# Column names, column types, and the first few values of each column.
str(sw_le4)
## 'data.frame': 81 obs. of 4 variables:
## $ Country.Name : chr "Uganda" "Ethiopia" "Nigeria" "Cambodia" ...
## $ Perc..Pop..Safe.Water : num 6.44 10.54 19.4 24.1 26.75 ...
## $ Average.Life.Expectancy..years.: num 59.5 65 53 68.5 69.9 ...
## $ Continent : chr "Africa" "Africa" "Africa" "Asia" ...
# Per-column summary: quartiles and mean for numeric columns, length/class/mode
# for character columns.
summary(sw_le4)
## Country.Name Perc..Pop..Safe.Water Average.Life.Expectancy..years.
## Length:81 Min. : 6.439 Min. :52.98
## Class :character 1st Qu.: 68.870 1st Qu.:72.22
## Mode :character Median : 91.694 Median :76.64
## Mean : 79.403 Mean :75.68
## 3rd Qu.: 98.024 3rd Qu.:81.39
## Max. :100.000 Max. :83.84
## Continent
## Length:81
## Class :character
## Mode :character
##
##
##
# Descriptive statistics for the numeric columns only. Passing the character
# columns (country name, continent) to psych::describe() triggered
# "NAs introduced by coercion" and min/max warnings and produced NaN/Inf rows.
describe(Filter(is.numeric, sw_le4))
## vars n mean sd median trimmed mad min
## Perc..Pop..Safe.Water 1 81 79.40 24.87 91.69 83.84 10.68 6.44
## Average.Life.Expectancy..years. 2 81 75.68 6.55 76.64 76.51 7.04 52.98
## max range skew kurtosis se
## Perc..Pop..Safe.Water 100.00 93.56 -1.30 0.57 2.76
## Average.Life.Expectancy..years. 83.84 30.87 -1.26 1.90 0.73
# Swap the auto-generated CSV headers for short, code-friendly column names.
names(sw_le2) <- c("country", "wateraccess", "lifeexpectancy")
names(sw_le4) <- c("country", "wateraccess", "lifeexpectancy", "continent")
# Simple linear regression with water access as the response and life
# expectancy as the predictor.
# NOTE(review): the plots put life expectancy on the y-axis; confirm that this
# response/predictor orientation is the intended one.
m5 <- lm(formula = wateraccess ~ lifeexpectancy, data = sw_le4)
summary(m5)
##
## Call:
## lm(formula = wateraccess ~ lifeexpectancy, data = sw_le4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.107 -6.213 2.114 7.393 32.846
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -143.0949 20.5176 -6.974 8.37e-10 ***
## lifeexpectancy 2.9400 0.2701 10.884 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.83 on 79 degrees of freedom
## Multiple R-squared: 0.5999, Adjusted R-squared: 0.5949
## F-statistic: 118.5 on 1 and 79 DF, p-value: < 2.2e-16
# Pearson chi-squared test on the two measurement columns.
# NOTE(review): both arguments are continuous numeric columns. chisq.test()
# cross-tabulates its inputs as factors, so almost every value becomes its own
# category (df = 5928 in the output below) and the approximation warning is
# raised. A correlation test such as cor.test() is the usual choice for two
# numeric variables -- TODO confirm the intended analysis.
chisq.test(sw_le4$wateraccess, sw_le4$lifeexpectancy)
## Warning in chisq.test(sw_le4$wateraccess, sw_le4$lifeexpectancy): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: sw_le4$wateraccess and sw_le4$lifeexpectancy
## X-squared = 5994, df = 5928, p-value = 0.2709
Conditions Required to apply the Chi-Square Goodness of Fit Test:
· Observations must be recorded and collected on a random basis. · There is no knowledge about the population or its parameters. · All the items in the sample must be independent. · Each group should contain more than 10 factors. · Groups/numbers of items must be reasonably large (at least 30).
All conditions are met by this dataset.
# Scatter plot of life expectancy against clean-water access with a fitted
# linear regression line and its confidence band.
sw_le4 %>%
  ggplot(aes(x = wateraccess, y = lifeexpectancy)) +
  geom_point() +
  # Spell out TRUE (T can be reassigned) and pass the method by name.
  geom_smooth(method = "lm", se = TRUE)
## `geom_smooth()` using formula 'y ~ x'
# Interactive scatter plot of life expectancy against clean-water access on
# fixed 0-100 axes, with a default smoother and a linear regression line.
# NOTE(review): `size = country` maps a discrete variable to point size, which
# ggplot2 advises against; a numeric variable may be a better choice.
p13b <- sw_le4 %>%
  ggplot() +
  geom_point(
    aes(x = wateraccess, y = lifeexpectancy, size = country, fill = continent),
    alpha = 0.5
  ) +
  # Default smoother, drawn without its confidence band.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    se = FALSE, lwd = 0.5, col = "black"
  ) +
  # Fitted linear regression line. geom_abline() has no `method` argument, so
  # geom_abline(method = lm) ignored it and drew the default y = x line.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    method = "lm", se = FALSE, lwd = 0.5, col = "black"
  ) +
  lims(x = c(0, 100), y = c(0, 100)) +
  ggtitle("Life Expect. by Water Access") +
  xlab("Clean Water Access (% Pop.)") +
  ylab("Life Expectancy (years)") +
  scale_fill_brewer()
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p13b)
## Warning: Using size for a discrete variable is not advised.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Variant of the interactive scatter plot with thinner lines and a grey
# regression line so it can be told apart from the black smoother.
# NOTE(review): `size = country` maps a discrete variable to point size, which
# ggplot2 advises against; a numeric variable may be a better choice.
p13c <- sw_le4 %>%
  ggplot() +
  geom_point(
    aes(x = wateraccess, y = lifeexpectancy, size = country, fill = continent),
    alpha = 0.5
  ) +
  # Default smoother, drawn without its confidence band.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    se = FALSE, lwd = 0.35, col = "black"
  ) +
  # Fitted linear regression line. geom_abline() has no `method` argument, so
  # geom_abline(method = lm) ignored it and drew the default y = x line.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    method = "lm", se = FALSE, lwd = 0.25, col = "darkgrey"
  ) +
  lims(x = c(0, 100), y = c(0, 100)) +
  ggtitle("Life Expect. by Water Access") +
  xlab("Clean Water Access (% Pop.)") +
  ylab("Life Expectancy (years)") +
  scale_fill_brewer()
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p13c)
## Warning: Using size for a discrete variable is not advised.
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Final interactive scatter plot: point size follows water access, fill colour
# follows continent, and a custom hover text is attached to each point.
p14 <- sw_le4 %>%
  ggplot() +
  geom_point(
    aes(
      x = wateraccess, y = lifeexpectancy, size = wateraccess, fill = continent,
      # `text` is not a ggplot2 aesthetic; it is carried along only so that
      # ggplotly() can use it as the tooltip.
      text = paste(
        "Country:", country, "</br>",
        "</br>Clean Water Access:", wateraccess,
        "</br>Life Expectancy:", lifeexpectancy
      )
    ),
    alpha = 0.5
  ) +
  # Default smoother, drawn without its confidence band.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    se = FALSE, lwd = 0.35, col = "black"
  ) +
  # Fitted linear regression line. geom_abline() has no `method` argument, so
  # geom_abline(method = lm) ignored it and drew the default y = x line.
  geom_smooth(
    aes(x = wateraccess, y = lifeexpectancy),
    method = "lm", se = FALSE, lwd = 0.25, col = "darkgrey"
  ) +
  ggtitle("Life Expectancy & Clean Water Access") +
  theme(plot.title = element_text(hjust = 0.5)) +
  # The continent legend belongs to the `fill` aesthetic, so its title is set
  # through `fill`; `col` and `guide_legend` are not mapped in this plot and
  # had no effect on it.
  labs(
    x = "Clean Water Access (% Pop.)",
    y = "Life Expectancy (years)",
    fill = "Continent"
  ) +
  scale_fill_brewer()
## Warning: Ignoring unknown aesthetics: text
# Show only the custom text in the hover tooltip.
ggplotly(p14, tooltip = "text")
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
The final plot is a zoomed-in and refined version of the plot above.
A circular point is provided for each individual country. The size of the circles visually represents access to clean water. The countries with the least access to clean water are shown with the smallest circles. The countries with the greatest access to clean water are represented with the largest circles. The colors of the circles represent the different continents on a blue spectrum. The countries in Africa, with the palest blue color, also tend to have the smallest circles - indicating that the African continent has the least access to clean water. The transparency of the blue circles has been adjusted to show the layering, particularly in the upper right-hand corner.
A legend has been provided for the colored circles, which is arranged by Continent and displayed in alphabetical order. The legend was positioned vertically, to the right of the plot, and aligned to the bottom. I have not yet figured out how to adjust the name and location of the legend title, despite many, many attempts.
The plot includes a main title, which is centered.
The x-axis and y-axis labels are applied and centered as well. The legend has been color coded by Continent alone.
A linear regression line is shown on the graph, which can be seen in full within plot 13b. The geom_smooth line has been shown in contrast to the regression line. An aesthetic choice was made to remove the Confidence Interval visualization around the geom_smooth line.
ggplotly was used to create an interactive plot. The tooltip text was edited so that when the mouse hovers over each individual country the reader is provided with the name of the country, the percentage of the population with access to clean water, and the life expectancy. I did not attempt to limit the number of decimal places displayed; however, my next step at further refinement would be to reduce the number of decimal places visible.
The statistical analysis presented above produced conflicting information. The linear regression model produced a p-value of < 2.2e-16. Next to both variables, access to clean water and life expectancy, there were three asterisks produced. The adjusted R-squared value of 0.5949 indicates that approximately 60% of the variance in the variables is explained by the relationship. These figures would suggest that life expectancy and access to clean water have a strong, statistically significant relationship.
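However, the chi-square goodness-of-fit test was also conducted. That test resulted in a p-value = 0.2709. All of the required conditions for the chi-square test are met by this dataset. This figure would suggest that there is very weak evidence for a statistically significant relationship between clean water access and life expectancy. Note, however, that chisq.test() treats every distinct numeric value as its own category (hence the 5,928 degrees of freedom and the warning that the approximation may be incorrect), so this p-value should be interpreted with caution when both variables are continuous.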
One possible explanation is that the sample size is so large in this case that the linear regression model is producing skewed results due to the large n. An effect size test should be performed to further explore the statistical significance of the relationship between clean water access and life expectancy.