##load packages to use
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.0.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ipumsr)
## Warning: package 'ipumsr' was built under R version 4.0.2
library(readr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.2
library(psych)
## Warning: package 'psych' was built under R version 4.0.2
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(haven)
# Import the dataset ----
# The CSV is read through an explicit path instead of calling setwd(), so the
# script does not change the session's working directory as a side effect.
# NOTE(review): the directory below is still machine-specific; point it at the
# folder that holds Infection_Risk.csv (or use a project-relative path) when
# running this script elsewhere.
Infect_Risk <- read_csv(
  path.expand(file.path("~/Documents/R_programming", "Infection_Risk.csv"))
)
## Parsed with column specification:
## cols(
## ID = col_double(),
## Stay = col_double(),
## Age = col_double(),
## InfctRsk = col_double(),
## Cultures = col_double(),
## Xrays = col_double(),
## Beds = col_double(),
## MedSchl = col_double(),
## Region = col_double(),
## Census = col_double(),
## Nurses = col_double(),
## Services = col_double()
## )
# Summary statistics for InfctRsk ----
# Five-number summary plus the mean, then the sample standard deviation.
summary(Infect_Risk[["InfctRsk"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.300 3.700 4.400 4.355 5.200 7.800
sd(Infect_Risk[["InfctRsk"]])
## [1] 1.340908
# Summary statistics for Nurses ----
# Same two calls, applied to the Nurses column.
summary(Infect_Risk[["Nurses"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.0 66.0 132.0 173.2 218.0 656.0
sd(Infect_Risk[["Nurses"]])
## [1] 139.2654
# Box plot for infection risk ----
boxplot(x = Infect_Risk[["InfctRsk"]])
# How to read it: the thick horizontal line inside the box is the median
# (second quartile) of infection risk (4.4). The lower edge of the box is the
# first quartile (3.7) and the upper edge is the third quartile (5.2). The
# whiskers reach the most extreme observations that lie within 1.5 times the
# interquartile range of the box; the small circles beyond the whiskers are
# outliers. The lowest circle is the minimum value (1.3) and the highest
# circle is the maximum value (7.8).
# Box plot for Nurses ----
boxplot(x = Infect_Risk[["Nurses"]])
# How to read it: the thick horizontal line inside the box is the median
# (second quartile) of Nurses (132). The lower edge of the box is the first
# quartile (66.0) and the upper edge is the third quartile (218.0). The small
# circles above the upper whisker are outliers, and the highest of them is the
# maximum value (656).
# Mean and median of infection risk; the histogram shows where the mode
# (the highest-frequency bin) falls.
mean(Infect_Risk$InfctRsk, na.rm = TRUE)
## [1] 4.354867
median(Infect_Risk$InfctRsk, na.rm = TRUE)
## [1] 4.4
hist(Infect_Risk$InfctRsk, main = "Risk of infection")
# Mean and median of Nurses; the histogram shows where the mode
# (the highest-frequency bin) falls.
mean(Infect_Risk$Nurses, na.rm = TRUE)
## [1] 173.2478
median(Infect_Risk$Nurses, na.rm = TRUE)
## [1] 132
# The title names the variable that is actually plotted (Nurses), so the two
# histograms cannot be confused with each other.
hist(Infect_Risk$Nurses, main = "Nurses")
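# Note: a normal distribution is symmetric, so its mean, median and mode
# coincide. Comparing the three is therefore a quick, informal check of
# symmetry; it does not by itself prove normality, and the size of the
# standard deviation is not a test of normality either. For infection risk the
# mean (4.35), the median (4.4) and the modal bin of the histogram are
# approximately equal, and the standard deviation (1.34) is small relative to
# the mean, so the variable is roughly symmetric and can reasonably be treated
# as approximately normal. For Nurses the mean (173.2) lies well above the
# median (132) and the standard deviation (139.3) is large relative to the
# mean (see the summary statistics and histograms above), which indicates a
# right-skewed distribution. Consequently only infection risk is treated as
# approximately normal; Nurses cannot be assumed to be normally distributed.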
# Mean infection risk for records whose Age is above 60.
# The filter tests Age alone. Combining it with `InfctRsk &` would coerce
# InfctRsk to logical (non-zero = TRUE) and silently drop any row whose
# infection risk is exactly 0, which is not part of the intended selection.
# NOTE(review): Age is presumably the patients' age for each record - confirm
# against the data dictionary.
Infection_risk_60 <- dplyr::filter(Infect_Risk, Age > 60)
mean(Infection_risk_60$InfctRsk, na.rm = TRUE)
## [1] 4.766667
# Mean infection risk for records whose Age is below 50.
Infection_risk_50 <- dplyr::filter(Infect_Risk, Age < 50)
mean(Infection_risk_50$InfctRsk, na.rm = TRUE)
## [1] 4.428571
# Average infection risk for each region
# One output row per Region; `mean` is the mean of InfctRsk with missing
# values ignored. No select() is needed because summarize() keeps only the
# grouping column and the summary column.
Infect_Risk %>%
  group_by(Region) %>%
  summarize(mean = mean(InfctRsk, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
# Average number of nurses for each region
# One output row per Region; `mean` is the mean of Nurses with missing values
# ignored. No select() is needed because summarize() keeps only the grouping
# column and the summary column.
Infect_Risk %>%
  group_by(Region) %>%
  summarize(mean = mean(Nurses, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
# 95% confidence interval for the mean infection risk ----
# The sample statistics are computed from the data rather than typed in by
# hand, so they cannot drift from the dataset or lose precision to rounding.
a <- mean(Infect_Risk$InfctRsk, na.rm = TRUE) # Sample mean
s <- sd(Infect_Risk$InfctRsk, na.rm = TRUE) # Sample standard deviation
n <- sum(!is.na(Infect_Risk$InfctRsk)) # Number of non-missing observations
# The standard error of the mean is s / sqrt(n). sd() already divides by
# n - 1, so using sqrt(n - 1) here would apply that correction twice and
# widen the interval.
# NOTE(review): qnorm() gives a z-interval; qt(0.975, df = n - 1) is the
# textbook choice when the population sd is unknown. With n this large the
# two differ only slightly.
error <- qnorm(0.975) * s / sqrt(n)
Lower_limit <- a - error
Upper_limit <- a + error
print(Lower_limit)
## approximately 4.108 (given mean 4.354867, sd 1.340908 and n = 113)
print(Upper_limit)
## approximately 4.602
# 95% confidence interval for the mean of Nurses ----
a <- mean(Infect_Risk$Nurses, na.rm = TRUE) # Sample mean
s <- sd(Infect_Risk$Nurses, na.rm = TRUE) # Sample standard deviation
n <- sum(!is.na(Infect_Risk$Nurses)) # Number of non-missing observations
error <- qnorm(0.975) * s / sqrt(n) # Margin of error, using s / sqrt(n)
Lower_limit_N <- a - error
Upper_limit_N <- a + error
print(Lower_limit_N)
## approximately 147.57 (given mean 173.2478, sd 139.2654 and n = 113)
print(Upper_limit_N)
## approximately 198.93
# Interpretation (infection risk): we are 95% confident that the true
# population mean infection risk lies between about 4.11 and 4.60, given that
# the variable is approximately normally distributed and the observations are
# independent.
# Interpretation (Nurses): we are 95% confident that the true population mean
# of Nurses lies between about 147.57 and 198.93, assuming the observations
# are independent. Nurses is right-skewed, so the interval relies on the
# sample being large enough for the sample mean to be approximately normal.
# The interval is also wide relative to the mean (roughly plus or minus 15%),
# so this sample gives only an imprecise estimate of the population mean for
# Nurses; a larger sample would narrow it.