library(readr)
# Read the infection-risk data set; the path is relative to the current
# working directory.
Infection_Risk <- read_csv("Infection_Risk.csv")
## Parsed with column specification:
## cols(
##   ID = col_double(),
##   Stay = col_double(),
##   Age = col_double(),
##   InfctRsk = col_double(),
##   Cultures = col_double(),
##   Xrays = col_double(),
##   Beds = col_double(),
##   MedSchl = col_double(),
##   Region = col_double(),
##   Census = col_double(),
##   Nurses = col_double(),
##   Services = col_double()
## )
# Open the data viewer only when a person is at the console, so batch runs
# (Rscript, knitting) do not try to open a viewer window.
if (interactive()) {
  View(Infection_Risk)
}

Question 1

# Structure of the data set: column names, types and the first few values.
str(Infection_Risk)
## tibble [113 x 12] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ ID      : num [1:113] 1 2 3 4 5 6 7 8 9 10 ...
##  $ Stay    : num [1:113] 7.13 8.82 8.34 8.95 11.2 ...
##  $ Age     : num [1:113] 55.7 58.2 56.9 53.7 56.5 ...
##  $ InfctRsk: num [1:113] 4.1 1.6 2.7 5.6 5.7 ...
##  $ Cultures: num [1:113] 9 3.8 8.1 18.9 34.5 ...
##  $ Xrays   : num [1:113] 39.6 51.7 74 122.8 88.9 ...
##  $ Beds    : num [1:113] 279 80 107 147 180 150 186 640 182 85 ...
##  $ MedSchl : num [1:113] 2 2 2 2 2 2 2 1 2 2 ...
##  $ Region  : num [1:113] 4 2 3 4 1 2 3 2 3 1 ...
##  $ Census  : num [1:113] 207 51 82 53 134 147 151 399 130 59 ...
##  $ Nurses  : num [1:113] 241 52 54 148 151 106 129 360 118 66 ...
##  $ Services: num [1:113] 60 40 20 40 40 40 40 60 40 40 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   ID = col_double(),
##   ..   Stay = col_double(),
##   ..   Age = col_double(),
##   ..   InfctRsk = col_double(),
##   ..   Cultures = col_double(),
##   ..   Xrays = col_double(),
##   ..   Beds = col_double(),
##   ..   MedSchl = col_double(),
##   ..   Region = col_double(),
##   ..   Census = col_double(),
##   ..   Nurses = col_double(),
##   ..   Services = col_double()
##   .. )
# Mean of each variable.
mean(Infection_Risk[["InfctRsk"]])
## [1] 4.354867
mean(Infection_Risk[["Nurses"]])
## [1] 173.2478
# Largest observed value of each variable.
max(Infection_Risk[["InfctRsk"]])
## [1] 7.8
max(Infection_Risk[["Nurses"]])
## [1] 656
# Smallest observed value of each variable.
min(Infection_Risk[["InfctRsk"]])
## [1] 1.3
min(Infection_Risk[["Nurses"]])
## [1] 14
# Sample standard deviation of each variable.
sd(Infection_Risk[["InfctRsk"]])
## [1] 1.340908
sd(Infection_Risk[["Nurses"]])
## [1] 139.2654

Question 2

# Box plot of infection risk; titled and labelled so the two plots can be
# told apart when viewed side by side or out of context.
boxplot(
  Infection_Risk$InfctRsk,
  main = "Infection risk (InfctRsk)",
  ylab = "InfctRsk"
)

# Box plot of the number of nurses.
boxplot(
  Infection_Risk$Nurses,
  main = "Number of nurses (Nurses)",
  ylab = "Nurses"
)

## Question 3

## Infection risk is distributed much more evenly than the number of nurses per hospital. Infection risk has values spread on both sides of its mean, whereas the number of nurses has a skewed distribution, with some outliers that fall far from the mean.

Question 4

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Mean and standard deviation of infection risk for rows with Age above 60.
# na.rm is written as TRUE because T is an ordinary variable that can be
# reassigned, while TRUE is a reserved constant.
Infection_Risk %>%
  filter(Age > 60) %>%
  summarize(
    mean = mean(InfctRsk, na.rm = TRUE),
    sd = sd(InfctRsk, na.rm = TRUE)
  )
## # A tibble: 1 x 2
##    mean    sd
##   <dbl> <dbl>
## 1  4.77  2.34

Question 4 Part 2

# Mean and standard deviation of infection risk for rows with Age below 50.
# na.rm is written as TRUE because T is an ordinary variable that can be
# reassigned, while TRUE is a reserved constant.
Infection_Risk %>%
  filter(Age < 50) %>%
  summarize(
    mean = mean(InfctRsk, na.rm = TRUE),
    sd = sd(InfctRsk, na.rm = TRUE)
  )
## # A tibble: 1 x 2
##    mean    sd
##   <dbl> <dbl>
## 1  4.43  1.10

Question 5 Part 1

# Replace the numeric Region codes 1-4 with a labelled factor. Codes are
# matched to labels by position.
Infection_Risk[["Region"]] <- factor(
  Infection_Risk[["Region"]],
  levels = 1:4,
  labels = c("Northeastern", "Northcentral", "South", "West")
)

Question 5 Part 2

# Number of rows in each Region level.
summary(Infection_Risk[["Region"]])
## Northeastern Northcentral        South         West 
##           28           32           37           16
library(readr)
library(dplyr)
# Mean and standard deviation of infection risk within each Region.
# The InfctRsk >= 1 filter keeps every row of this data set (the minimum
# printed earlier is 1.3); it is retained so the results are unchanged.
# na.rm is written as TRUE because T is an ordinary, reassignable variable.
Infection_Risk %>%
  filter(InfctRsk >= 1) %>%
  group_by(Region) %>%
  summarise(
    mean = mean(InfctRsk, na.rm = TRUE),
    sd = sd(InfctRsk, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   Region        mean    sd
##   <fct>        <dbl> <dbl>
## 1 Northeastern  4.86 1.27 
## 2 Northcentral  4.39 1.34 
## 3 South         3.93 1.46 
## 4 West          4.38 0.877

Question 5 Part 3

# Mean and standard deviation of the number of nurses within each Region.
# The Nurses >= 1 filter keeps every row of this data set (the minimum
# printed earlier is 14); it is retained so the results are unchanged.
# na.rm is written as TRUE because T is an ordinary, reassignable variable.
Infection_Risk %>%
  filter(Nurses >= 1) %>%
  group_by(Region) %>%
  summarise(
    mean = mean(Nurses, na.rm = TRUE),
    sd = sd(Nurses, na.rm = TRUE)
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 3
##   Region        mean    sd
##   <fct>        <dbl> <dbl>
## 1 Northeastern  191.  146.
## 2 Northcentral  186.  134.
## 3 South         161.  136.
## 4 West          148.  152.

Bonus Question

# 95% confidence intervals for the mean of InfctRsk and of Nurses, built as
# mean +/- qnorm(0.975) * sd / sqrt(n).

# Infection risk: mean, standard deviation, record count, margin of error,
# then the lower and upper bounds.
MeanInfctRsk <- mean(Infection_Risk[["InfctRsk"]])
SDInfctRsk <- sd(Infection_Risk[["InfctRsk"]])
IFRecords <- length(Infection_Risk[["InfctRsk"]])
ErrorInfctRsk <- qnorm(0.975) * SDInfctRsk / sqrt(IFRecords)
LeftInfctRsk <- MeanInfctRsk - ErrorInfctRsk
RightInfctRsk <- MeanInfctRsk + ErrorInfctRsk

# Nurses: the same steps.
MeanNurses <- mean(Infection_Risk[["Nurses"]])
SDNurses <- sd(Infection_Risk[["Nurses"]])
NRecords <- length(Infection_Risk[["Nurses"]])
ErrorNurses <- qnorm(0.975) * SDNurses / sqrt(NRecords)
LeftNurses <- MeanNurses - ErrorNurses
RightNurses <- MeanNurses + ErrorNurses

# Show the interval bounds.
print(LeftInfctRsk)
## [1] 4.107633
print(RightInfctRsk)
## [1] 4.602101
print(LeftNurses)
## [1] 147.5703
print(RightNurses)
## [1] 198.9252
## With 95% confidence, the mean infection risk lies between 4.11 and 4.60, and the mean number of nurses per hospital lies between 147.57 and 198.93.