library(haven)
library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ipumsr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Load the infection-risk data set. read_csv() guesses the column types and
# reports them (see the column specification echoed below).
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file exists; consider a project-relative path.
mydata <- read_csv("C:/Users/chris/Downloads/Infection_Risk(1).csv")
## Parsed with column specification:
## cols(
##   ID = col_double(),
##   Stay = col_double(),
##   Age = col_double(),
##   InfctRsk = col_double(),
##   Cultures = col_double(),
##   Xrays = col_double(),
##   Beds = col_double(),
##   MedSchl = col_double(),
##   Region = col_double(),
##   Census = col_double(),
##   Nurses = col_double(),
##   Services = col_double()
## )
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is run non-interactively (Rscript, knitting).
if (interactive()) {
  View(mydata)
}

Question 1. Conduct a descriptive analysis for the two variables and show the following statistics: mean, standard deviation, minimum, and maximum.

# Question 1: keep only the two variables under study (InfctRsk and Nurses).
assignment2 <- subset(mydata, select = c("InfctRsk", "Nurses"))
# Only open the data viewer when someone is there to look at it.
if (interactive()) {
  View(assignment2)
}
# describe() (psych) prints one row of summary statistics per variable; the
# mean, sd, min and max columns answer the question.
describe(assignment2$InfctRsk)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 113 4.35 1.34    4.4    4.38 1.19 1.3 7.8   6.5 -0.12     0.07 0.13
describe(assignment2$Nurses)
##    vars   n   mean     sd median trimmed    mad min max range skew kurtosis
## X1    1 113 173.25 139.27    132  151.96 103.78  14 656   642 1.34     1.35
##      se
## X1 13.1

Question 2. Produce boxplots for these two variables and interpret these boxplots

# Question 2: draw one box-and-whisker plot per variable, each on its own
# figure, titled with the variable it shows.
boxplot(assignment2[["InfctRsk"]], main = "Boxplot for InfctRsk")

boxplot(assignment2[["Nurses"]], main = "Boxplot for Nurses")

The boxplot for InfctRsk (infection risk at the hospital) has outliers that are closely clustered, and it displays the minimum value, the first quartile (Q1), the median, the third quartile (Q3), and the maximum value. Like the boxplot for InfctRsk, the boxplot for Nurses also has outliers; however, I can observe that there is greater variability in the boxplot for Nurses, as well as larger outliers. Therefore, I can conclude that their medians do differ. The median divides the observations in half: half of the values are greater than or equal to it and half are less. The first quartile separates the lower 25% of the data from the rest, while the third quartile separates the upper 25% of the observations from the rest. The maximum value represents the upper limit of the observations that are not outliers. Likewise, for InfctRsk, the minimum value is the lower limit of the observations that are not outliers.

Question 3. Do these two variables have a normal distribution? Why or why not?

Neither variable appears to be normally distributed. Although the boxplot for InfctRsk has a median in the middle of the box, its whiskers are not about the same length on both sides of the box. As a result, the distribution is not symmetric. The boxplot for Nurses, on the other hand, shows a distribution that is positively skewed, because the median is closer to the bottom of the box.

Question 4. Calculate the average infection risk for parents who are over 60 years old, and who are below 50 years old, respectively.

# Question 4: mean infection risk for the rows whose Age is above 60 and for
# the rows whose Age is below 50.
parentAge <- subset(mydata, select = c("InfctRsk", "Age"))

# subset() has no na.rm argument (it was silently ignored before); rows whose
# condition evaluates to NA are dropped by subset() anyway. Missing InfctRsk
# values are instead handled where they matter, in mean().
parentsover60 <- subset(parentAge, Age > 60)
if (interactive()) {
  View(parentsover60)
}
mean(parentsover60$InfctRsk, na.rm = TRUE)
## [1] 4.766667
parentsbelow50 <- subset(parentAge, Age < 50)
if (interactive()) {
  View(parentsbelow50)
}
mean(parentsbelow50$InfctRsk, na.rm = TRUE)
## [1] 4.428571

Question 5. What’s the average infection risk for each region? And what’s the average number of nurses for each region?

# Question 5: per-region averages of infection risk and of nurse counts.
wadi <- subset(mydata, select = c("InfctRsk", "Region", "Nurses"))
# Only open the data viewer in an interactive session.
if (interactive()) {
  View(wadi)
}
# NOTE(review): this data frame shares its name with its own Region column.
# Inside group_by() the name resolves to the column, so the code works, but a
# distinct name would read better. Kept as-is so later code that refers to
# `Region` is unaffected.
Region <- data.frame(wadi)

# Mean infection risk per region. na.rm = TRUE matches the Nurses summary
# below, so both averages treat missing values the same way.
Region %>%
  group_by(Region) %>%
  summarize(mean = mean(InfctRsk, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
##   Region  mean
##    <dbl> <dbl>
## 1      1  4.86
## 2      2  4.39
## 3      3  3.93
## 4      4  4.38

# Mean number of nurses per region.
Region %>%
  group_by(Region) %>%
  summarize(mean = mean(Nurses, na.rm = TRUE))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
##   Region  mean
##    <dbl> <dbl>
## 1      1  191.
## 2      2  186.
## 3      3  161.
## 4      4  148.

Question 6. What’s the confidence interval for InfctRsk and Nurses, respectively? Show your calculation.

# Normal-approximation (z) interval around a sample mean.
# Returns the margin of error together with the lower and upper limits.
# centre, spread and size are the sample mean, standard deviation and count;
# prob is the quantile passed to qnorm() (0.975 gives a two-sided 95% interval).
z_interval <- function(centre, spread, size, prob = 0.975) {
  margin <- qnorm(prob) * spread / sqrt(size)
  list(margin = margin, lower = centre - margin, upper = centre + margin)
}

## CI for InfctRsk
# Mean, sd and n are the rounded figures reported by describe() above.
a <- 4.35
s <- 1.34
n <- 113

infct_ci <- z_interval(a, s, n)
error <- infct_ci$margin
left <- infct_ci$lower
right <- infct_ci$upper
print(c(left, right))
## [1] 4.102934 4.597066
## CI for Nurses

# Same calculation with the Nurses summary figures; a, s, n and error are
# deliberately overwritten, as in the original script.
a <- 173.25
s <- 139.27
n <- 113

nurses_ci <- z_interval(a, s, n)
error <- nurses_ci$margin
leftn <- nurses_ci$lower
rightn <- nurses_ci$upper
print(c(leftn, rightn))
## [1] 147.5717 198.9283

The CI for infection risk (InfctRsk) means that we are 95% confident that the true mean lies between 4.1 and 4.6.

For the variable Nurses, we are 95% confident that the true mean lies between 147.6 and 198.9.