library(readr)
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(ipumsr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Load the infection-risk data; the path is relative to the working directory.
Infection_Risk_1_ <- read_csv("Infection_Risk(1).csv")
## Parsed with column specification:
## cols(
## ID = col_double(),
## Stay = col_double(),
## Age = col_double(),
## InfctRsk = col_double(),
## Cultures = col_double(),
## Xrays = col_double(),
## Beds = col_double(),
## MedSchl = col_double(),
## Region = col_double(),
## Census = col_double(),
## Nurses = col_double(),
## Services = col_double()
## )
# View() opens a data viewer window, which is only meaningful when someone is
# sitting at an interactive session; skip it when the script is run in batch.
if (interactive()) {
  View(Infection_Risk_1_)
}
#1. Conduct a descriptive analysis for the two variables and show the following statistics: mean, standard deviation, minimum, and maximum.
# summary() reports min, quartiles, mean and max for each variable.
summary(Infection_Risk_1_[["InfctRsk"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.300 3.700 4.400 4.355 5.200 7.800
summary(Infection_Risk_1_[["Nurses"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.0 66.0 132.0 173.2 218.0 656.0
# describe() (psych) adds the standard deviation, which summary() omits.
describe(Infection_Risk_1_[["Nurses"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 113 173.25 139.27 132 151.96 103.78 14 656 642 1.34 1.35
## se
## X1 13.1
describe(Infection_Risk_1_[["InfctRsk"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 113 4.35 1.34 4.4 4.38 1.19 1.3 7.8 6.5 -0.12 0.07 0.13
#2. Produce boxplots for these two variables and interpret these boxplots
boxplot(Infection_Risk_1_[["InfctRsk"]], main = "Infection Rate")
describe(Infection_Risk_1_[["InfctRsk"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 113 4.35 1.34 4.4 4.38 1.19 1.3 7.8 6.5 -0.12 0.07 0.13
# Interpretation: the mean infection rate (4.35) is close to the median (4.4),
# which suggests a roughly symmetric, approximately normal distribution.
# Since the 3rd quartile is 5.2 and the 1st quartile is 3.7, the IQR is 1.5.
# The boxplot flags 3 hospitals as outliers with high infection rates and
# 2 hospitals as outliers with low infection rates.
#2 (continued). Nurses boxplot
boxplot(Infection_Risk_1_[["Nurses"]], main = "Nurses")
summary(Infection_Risk_1_[["Nurses"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.0 66.0 132.0 173.2 218.0 656.0
# Interpretation: the mean is 173.2 (from the summary above) and the IQR is
# 152 (218 - 66). The boxplot shows 6 outliers: 4 hospitals with 400+ nurses
# and 2 others with 600+ nurses. The number of nurses varies widely between
# hospitals.
#3. Do these two variables have a normal distribution? Why or why not?
# Infection rate appears approximately normally distributed; Nurses does not.
# The histogram with the mean line (abline) for infection rate indicates a
# symmetrical distribution; for nurses it indicates a highly skewed
# distribution (skew of -0.12 vs. 1.34 in the describe() output for Q1).
# The hist() arguments are left as written because the default plot title and
# axis label are built from the expression passed in.
hist(Infection_Risk_1_$InfctRsk)
abline(v = mean(Infection_Risk_1_[["InfctRsk"]]), col = "red")
hist(Infection_Risk_1_$Nurses)
abline(v = mean(Infection_Risk_1_[["Nurses"]]), col = "red")
#4. Calculate the average infection risk for patients who are over 60 years old, and who are below 50 years old, respectively.
# Age is a numeric column, so the thresholds must be numbers. A quoted
# threshold such as "60" makes R coerce Age to character and compare the
# values as text, which only gives the intended rows by coincidence.
filter(Infection_Risk_1_, Age > 60)
## # A tibble: 6 x 12
## ID Stay Age InfctRsk Cultures Xrays Beds MedSchl Region Census Nurses
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 24 9.84 62.2 4.80 12 82.3 600 2 3 468 497
## 2 40 8.16 60.9 1.30 1.90 58 73 2 3 49 21
## 3 53 11.4 61.1 7.60 16.6 97.9 535 2 3 330 273
## 4 63 7.93 64.1 5.40 7.5 98.1 68 2 4 42 49
## 5 104 13.9 65.9 6.60 15.6 134. 356 2 1 308 182
## 6 106 10.8 63.9 2.90 1.60 57.4 130 2 3 69 62
## # … with 1 more variable: Services <dbl>
filter(Infection_Risk_1_, Age < 50)
## # A tibble: 21 x 12
## ID Stay Age InfctRsk Cultures Xrays Beds MedSchl Region Census Nurses
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 8 11.2 45.7 5.40 60.5 85.8 640 1 2 399 360
## 2 9 8.67 48.2 4.30 24.4 90.8 182 2 3 130 118
## 3 17 8.28 48.1 4.5 26 102. 108 2 4 84 73
## 4 21 7.53 42 4.20 23.1 98.9 95 2 4 47 49
## 5 22 10.2 49 4.80 36.3 113. 195 2 2 163 170
## 6 26 8.28 49.5 3.90 12 113. 546 1 2 413 436
## 7 27 9.31 47.2 4.5 30.2 101. 170 2 1 124 173
## 8 31 11.0 49.9 5 19.7 102. 318 2 1 270 335
## 9 38 7.84 49.1 4.60 7.10 87.9 60 2 3 50 45
## 10 43 11.2 45 3 7 78.9 130 2 3 95 56
## # … with 11 more rows, and 1 more variable: Services <dbl>
# Keep each age group so its mean can be computed below.
dat1 <- Infection_Risk_1_ %>%
  filter(Age > 60)
dat2 <- Infection_Risk_1_ %>%
  filter(Age < 50)
# Average infection risk in each group, which is what the question asks for.
# For the over-60 group, the six rows printed above average to about 4.77.
mean(dat1$InfctRsk, na.rm = TRUE)
mean(dat2$InfctRsk, na.rm = TRUE)
#5. What’s the average infection risk for each region? And what’s the average number of nurses for each region?
# Keep only the columns needed for the infection-risk summary and drop rows
# with missing values in either of them.
df3 <- subset(x = Infection_Risk_1_, select = c("InfctRsk", "Region"))
df3 <- na.omit(df3)
# Average infection risk per region.
df3 %>%
  group_by(Region) %>%
  summarize(mean = mean(InfctRsk))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
## Region mean
## <dbl> <dbl>
## 1 1 4.86
## 2 2 4.39
## 3 3 3.93
## 4 4 4.38
# Average number of nurses per region (the second half of the question),
# prepared the same way as the infection-risk summary above.
nurses_by_region <- subset(x = Infection_Risk_1_, select = c("Nurses", "Region"))
nurses_by_region <- na.omit(nurses_by_region)
nurses_by_region %>%
  group_by(Region) %>%
  summarize(mean = mean(Nurses))
#6. Now, I’d like to make inferences about the population. What’s the confidence interval for InfctRsk and Nurses, respectively? Show your calculation.

# Confidence interval for a population mean, computed from the data itself
# rather than from rounded, hand-copied summary statistics.
#
# x          numeric vector of observations; NA values are dropped.
# conf_level confidence level strictly between 0 and 1 (default 0.95).
#
# Returns a named numeric vector c(lower = ..., upper = ...).
# Calculation: mean(x) +/- z * sd(x) / sqrt(n), where z is the standard normal
# quantile for the requested level. This is the same large-sample formula used
# in the original calculation; a t-based interval (qt with n - 1 degrees of
# freedom) would be marginally wider.
mean_ci <- function(x, conf_level = 0.95) {
  x <- x[!is.na(x)]
  n_obs <- length(x)
  stopifnot(is.numeric(x), n_obs > 1, conf_level > 0, conf_level < 1)
  std_error <- sd(x) / sqrt(n_obs)
  margin <- qnorm(1 - (1 - conf_level) / 2) * std_error
  c(lower = mean(x) - margin, upper = mean(x) + margin)
}

#CI for InfctRsk
ci_infct <- mean_ci(Infection_Risk_1_$InfctRsk)
left <- ci_infct[["lower"]]
right <- ci_infct[["upper"]]
print(c(left, right))
# An earlier run with the rounded, hard-coded inputs (mean 4.35, sd 1.34,
# n 113) printed: 4.102934 4.597066

#CI for Nurses
ci_nurses <- mean_ci(Infection_Risk_1_$Nurses)
leftn <- ci_nurses[["lower"]]
rightn <- ci_nurses[["upper"]]
print(c(leftn, rightn))
# An earlier run with the rounded, hard-coded inputs (mean 173.25, sd 139.27,
# n 113) printed: 147.5717 198.9283

#7. Interpret the confidence intervals (CI). What do these CIs mean?
#CI for infection rate: we are 95% confident that the population mean infection rate is between about 4.1 and 4.6.
#CI for nurses: we are 95% confident that the population mean number of nurses is between about 147.6 and 198.9.