library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gapminder)
library(ggplot2)
library(readr)
# Read the raw survey export and keep only the three variables analysed below.
COVID <- read_csv(file = "~/Downloads/Skills Drill 2 COVID Survey Data.csv") %>%
  select(
    Likelihood_Infected,
    Facemask_Wear,
    Household_Size
  )
## Parsed with column specification:
## cols(
## Likelihood_Infected = col_double(),
## Facemask_Wear = col_double(),
## Household_Size = col_double()
## )
# Preview the first six rows to confirm the import.
head(COVID, n = 6)
## # A tibble: 6 x 3
## Likelihood_Infected Facemask_Wear Household_Size
## <dbl> <dbl> <dbl>
## 1 4 1 2
## 2 2 0 1
## 3 2 1 3
## 4 2 1 7
## 5 0 1 3
## 6 2 1 4
Likelihood_Infected
0 = Not likely at all
1 = Not too likely
2 = Somewhat likely
3 = Very likely
4 = I have already contracted the virus.
EC (1pt): recode this Likelihood_Infected variable as an ordered factor
Facemask_Wear
0 = No
1 = Yes
# Recode the numeric survey codes into labelled factors.
#
# Likelihood_Infected: codes 0-4 become an ordered factor running from
#   "Not likely at all" up to "I have already contracted the virus.".
#   Codes are mapped to labels directly through factor(levels =, labels =),
#   so a label can never drift out of sync with the level set. (Previously
#   code 4 was recoded to a string with a leading space that matched no
#   level, which silently turned those responses into NA.)
# Facemask_Wear: codes 0/1 become a factor with levels "No", "Yes".
# Any value that is not one of the listed codes becomes NA.
COVID_New <- COVID %>%
  mutate(
    Likelihood_Infected = factor(
      Likelihood_Infected,
      levels = 0:4,
      labels = c(
        "Not likely at all",
        "Not too likely",
        "Somewhat likely",
        "Very likely",
        "I have already contracted the virus."
      ),
      ordered = TRUE
    ),
    Facemask_Wear = factor(
      Facemask_Wear,
      levels = c(0, 1),
      labels = c("No", "Yes")
    )
  )
head(COVID_New)
## # A tibble: 6 x 3
## Likelihood_Infected Facemask_Wear Household_Size
## <fct> <fct> <dbl>
## 1 <NA> Yes 2
## 2 Somewhat likely No 1
## 3 Somewhat likely Yes 3
## 4 Somewhat likely Yes 7
## 5 Not likely at all Yes 3
## 6 Somewhat likely Yes 4
# Cross-tabulate infection likelihood (rows) against mask wearing (columns),
# convert each cell to a share of its column, and round to two decimals.
table(COVID_New$Likelihood_Infected, COVID_New$Facemask_Wear) %>%
  prop.table(margin = 2) %>%
  round(digits = 2)
##
## No Yes
## Not likely at all 0.17 0.14
## Not too likely 0.46 0.39
## Somewhat likely 0.27 0.33
## Very likely 0.09 0.14
## have already contracted the virus. 0.00 0.00
# Stacked bar chart of mask wearing within each infection-likelihood category.
# Rows with a missing value in either variable are dropped first.
# `percent` is the share of each Facemask_Wear answer *within* its
# Likelihood_Infected category, so every stacked bar sums to 1. The grouping
# used for that share is stated explicitly (rather than relying on summarize()
# peeling off the last grouping variable) and removed again before plotting.
COVID_New %>%
  filter(!is.na(Likelihood_Infected), !is.na(Facemask_Wear)) %>%
  count(Likelihood_Infected, Facemask_Wear) %>%
  group_by(Likelihood_Infected) %>%
  mutate(percent = n / sum(n)) %>%
  ungroup() %>%
  ggplot() +
  geom_col(aes(x = Likelihood_Infected, y = percent, fill = Facemask_Wear)) +
  theme_minimal()
# Average household size within each infection-likelihood category.
# na.rm = TRUE: without it, a single missing Household_Size value makes the
# mean of the whole group NA.
COVID_New %>%
  group_by(Likelihood_Infected) %>%
  summarize(AVGhousehold_size = mean(Household_Size, na.rm = TRUE))
## Warning: Factor `Likelihood_Infected` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 5 x 2
## Likelihood_Infected AVGhousehold_size
## <fct> <dbl>
## 1 Not likely at all NA
## 2 Not too likely NA
## 3 Somewhat likely 3.91
## 4 Very likely 4.04
## 5 <NA> 4.36
# Distribution of household size, one panel per infection-likelihood category.
# Rows with a missing value in either variable are dropped first.
# binwidth = 1 draws one bar per household size instead of falling back on the
# arbitrary 30-bin default that stat_bin() warns about.
# NOTE(review): assumes Household_Size holds whole-number counts — confirm.
COVID_New %>%
  filter(!is.na(Household_Size), !is.na(Likelihood_Infected)) %>%
  ggplot() +
  geom_histogram(
    aes(x = Household_Size, fill = Likelihood_Infected),
    binwidth = 1
  ) +
  facet_wrap(~Likelihood_Infected) +
  theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
### This chart shows a separate histogram of household size for each category of perceived likelihood of contracting COVID-19. Of the 4 graphs, “Very likely” has the lowest counts, with its highest count at about 11. The second lowest is “Not likely at all”, whose highest count is 19. Next is “Somewhat likely”, whose highest count is about 39, and the highest is “Not too likely”, whose highest count is about 45. All 4 graphs appear to show a roughly normal distribution: the highest point is near the middle of each graph, and the counts decrease toward the ends.