# First things first: load the libraries!

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ipumsr)
library(readr)
library(ggplot2)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(haven)

#Please download the data from Blackboard (Infection_Risk.csv) to answer the following questions. The data set gives a number of characteristics for 113 hospitals across the nation. The original purpose is to understand the factors for infection risk for patients while in the hospital. In this assignment, we primarily use two variables: InfctRsk and Nurses. The former refers to the risk of being infected in a hospital and the latter indicates the number of nurses in a hospital.

# Read the hospital data set into a tibble. The path is relative to the
# current working directory. The recorded output below shows 113 rows and
# 12 columns, all parsed as double.
infection <- read_csv(file = "Infection_Risk(1).csv")
## Parsed with column specification:
## cols(
##   ID = col_double(),
##   Stay = col_double(),
##   Age = col_double(),
##   InfctRsk = col_double(),
##   Cultures = col_double(),
##   Xrays = col_double(),
##   Beds = col_double(),
##   MedSchl = col_double(),
##   Region = col_double(),
##   Census = col_double(),
##   Nurses = col_double(),
##   Services = col_double()
## )

# Show the first rows of the tibble (printed once).
print(infection)
## # A tibble: 113 x 12
##       ID  Stay   Age InfctRsk Cultures Xrays  Beds MedSchl Region Census Nurses
##    <dbl> <dbl> <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1     1  7.13  55.7     4.10     9     39.6   279       2      4    207    241
##  2     2  8.82  58.2     1.60     3.80  51.7    80       2      2     51     52
##  3     3  8.34  56.9     2.70     8.10  74     107       2      3     82     54
##  4     4  8.95  53.7     5.60    18.9  123.    147       2      4     53    148
##  5     5 11.2   56.5     5.70    34.5   88.9   180       2      1    134    151
##  6     6  9.76  50.9     5.10    21.9   97     150       2      2    147    106
##  7     7  9.68  57.8     4.60    16.7   79     186       2      3    151    129
##  8     8 11.2   45.7     5.40    60.5   85.8   640       1      2    399    360
##  9     9  8.67  48.2     4.30    24.4   90.8   182       2      3    130    118
## 10    10  8.84  56.3     6.30    29.6   82.6    85       2      1     59     66
## # ... with 103 more rows, and 1 more variable: Services <dbl>

# View() opens a spreadsheet-style viewer, so only call it when a person is
# at the console; non-interactive runs (Rscript, knitting) skip it.
if (interactive()) {
  View(infection)
}

# Per-column minimum, quartiles, mean and maximum.
summary(infection)
##        ID           Stay             Age           InfctRsk        Cultures
##  Min.   :  1   Min.   : 6.700   Min.   :38.80   Min.   :1.300   Min.   : 1.60
##  1st Qu.: 29   1st Qu.: 8.340   1st Qu.:50.90   1st Qu.:3.700   1st Qu.: 8.40
##  Median : 57   Median : 9.420   Median :53.20   Median :4.400   Median :14.10
##  Mean   : 57   Mean   : 9.648   Mean   :53.23   Mean   :4.355   Mean   :15.79
##  3rd Qu.: 85   3rd Qu.:10.470   3rd Qu.:56.20   3rd Qu.:5.200   3rd Qu.:20.30
##  Max.   :113   Max.   :19.560   Max.   :65.90   Max.   :7.800   Max.   :60.50
##      Xrays             Beds          MedSchl         Region
##  Min.   : 39.60   Min.   : 29.0   Min.   :1.00   Min.   :1.000
##  1st Qu.: 69.50   1st Qu.:106.0   1st Qu.:2.00   1st Qu.:2.000
##  Median : 82.30   Median :186.0   Median :2.00   Median :2.000
##  Mean   : 81.63   Mean   :252.2   Mean   :1.85   Mean   :2.363
##  3rd Qu.: 94.10   3rd Qu.:312.0   3rd Qu.:2.00   3rd Qu.:3.000
##  Max.   :133.50   Max.   :835.0   Max.   :2.00   Max.   :4.000
##      Census          Nurses         Services
##  Min.   : 20.0   Min.   : 14.0   Min.   : 5.70
##  1st Qu.: 68.0   1st Qu.: 66.0   1st Qu.:31.40
##  Median :143.0   Median :132.0   Median :42.90
##  Mean   :191.4   Mean   :173.2   Mean   :43.16
##  3rd Qu.:252.0   3rd Qu.:218.0   3rd Qu.:54.30
##  Max.   :791.0   Max.   :656.0   Max.   :80.00

# Column names and (rows, columns) of the data set.
names(infection)
##  [1] "ID"       "Stay"     "Age"      "InfctRsk" "Cultures" "Xrays"
##  [7] "Beds"     "MedSchl"  "Region"   "Census"   "Nurses"   "Services"
dim(infection)
## [1] 113  12

#1. Please conduct a descriptive analysis for the two variables and show the following statistics: mean, standard deviation, minimum, and maximum

# Infection risk: summary() gives min, quartiles, mean and max; sd() adds the
# standard deviation.
summary(infection[["InfctRsk"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.300   3.700   4.400   4.355   5.200   7.800
sd(infection[["InfctRsk"]])
## [1] 1.340908
# Alternatively, describe() from the psych package reports all of these
# statistics in a single call (a tip I shared with my peers).
psych::describe(infection[["InfctRsk"]])

# Number of nurses: same three steps.
summary(infection[["Nurses"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0    66.0   132.0   173.2   218.0   656.0
sd(infection[["Nurses"]])
## [1] 139.2654
psych::describe(infection[["Nurses"]])

#2. Produce boxplots for these two variables and interpret these boxplots (20)

# One box-and-whisker plot per variable.
boxplot(infection[["InfctRsk"]], main = "Infection Risk")

boxplot(infection[["Nurses"]], main = "Nurses")

# Interquartile range (3rd quartile minus 1st quartile) of each variable.
IQR(infection[["InfctRsk"]])
## [1] 1.5
IQR(infection[["Nurses"]])
## [1] 152
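#In the boxplot for "Infection Risk", the IQR is 1.5, the 1st Qu. is 3.7, the median is 4.4, and the 3rd Qu. is 5.2. The whiskers reach at most 1.5 x IQR beyond the quartiles (i.e. to 1.45 and 7.45), so the values above 7.45 (up to the maximum of 7.8) and below 1.45 (the minimum of 1.3) are plotted as outliers.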
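#In the boxplot for "Nurses", the IQR is 152, the 1st Qu. is 66, the median is 132, and the 3rd Qu. is 218. The upper whisker reaches at most 218 + 1.5 x 152 = 446, so hospitals with more than 446 nurses (up to the maximum of 656) are plotted as outliers; there are no outliers at the low end. (Note: The IQR is the difference between the 3rd and 1st quartiles.)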

#3. Do these two variables have a normal distribution? Why or why not? (20)

# Histogram of infection risk. Explicit main/xlab replace the default labels,
# which would otherwise show the raw expression "infection$InfctRsk".
hist(infection$InfctRsk,
     main = "Histogram of Infection Risk",
     xlab = "Infection risk (InfctRsk)")
# Red vertical line at the sample mean, to judge symmetry around it.
abline(v = mean(infection$InfctRsk), col = "red")

# Histogram of the number of nurses, with the sample mean marked the same way.
hist(infection$Nurses,
     main = "Histogram of Nurses",
     xlab = "Number of nurses (Nurses)")
abline(v = mean(infection$Nurses), col = "red")

#By appearance only, the variable "InfctRsk" appears to be approximately normally distributed, while the variable "Nurses" does not. This judgment is based on the bell shape and symmetry that are typical of a normal distribution. Moreover, the abline function draws the sample mean on each histogram. A normal distribution is symmetric around its mean, which is not the case for the "Nurses" variable (its histogram is skewed to the right), but is much more apparent in the "InfctRsk" histogram.

#4. Calculate the average infection risk for patients who are over 60 years old, and who are below 50 years old, respectively. (20)

# To answer this, keep only the two columns needed: infection risk and Age.
# NOTE(review): Age is presumably the average patient age per hospital;
# confirm against the data dictionary.
infage2 <- subset(x = infection, select = c("InfctRsk", "Age"))
print(infage2)
## # A tibble: 113 x 2
##    InfctRsk   Age
##       <dbl> <dbl>
##  1     4.10  55.7
##  2     1.60  58.2
##  3     2.70  56.9
##  4     5.60  53.7
##  5     5.70  56.5
##  6     5.10  50.9
##  7     4.60  57.8
##  8     5.40  45.7
##  9     4.30  48.2
## 10     6.30  56.3
## # ... with 103 more rows
# View() is only useful at an interactive console.
if (interactive()) {
  View(infage2)
}


# Rows with Age above 60. subset() has no na.rm argument; it already drops
# rows where the condition evaluates to NA.
Over60 <- subset(infage2, Age > 60)
if (interactive()) {
  View(Over60)
}
mean(Over60$InfctRsk)
## [1] 4.766667
summary(Over60)
##     InfctRsk          Age       
##  Min.   :1.300   Min.   :60.90  
##  1st Qu.:3.375   1st Qu.:61.38  
##  Median :5.100   Median :63.05  
##  Mean   :4.767   Mean   :63.02  
##  3rd Qu.:6.300   3rd Qu.:64.05  
##  Max.   :7.600   Max.   :65.90

# Rows with Age below 50.
Under50 <- subset(infage2, Age < 50)
mean(Under50$InfctRsk)
## [1] 4.428571
# Summarise the under-50 group itself (not Over60).
summary(Under50)

#5. What’s the average infection risk for each region? And What’s the average number of nurses for each region? (20)

# First, keep the grouping column (Region) and the two measures the question
# asks about: infection risk and number of nurses.
Regioninfect <- subset(x = infection,
                       select = c("InfctRsk", "Nurses", "Region"))
# View() is only useful at an interactive console.
if (interactive()) {
  View(Regioninfect)
}

# Then, convert the tibble to a plain data frame.
regdataframe <- data.frame(Regioninfect)
if (interactive()) {
  View(regdataframe)
}

# Finally, compute both averages for each region in one table.
# .groups = "drop" returns an ungrouped result and silences the
# "`summarise()` ungrouping output" message.
regdataframe %>%
  group_by(Region) %>%
  summarize(
    mean_infct_rsk = mean(InfctRsk, na.rm = TRUE),
    mean_nurses = mean(Nurses, na.rm = TRUE),
    .groups = "drop"
  )

#6. Now, I’d like to make inferences about the population. What’s the confidence interval for InfctRsk and Nurses, respectively? Show your calculation. (bonus 10)

# 95% confidence interval for a population mean, normal approximation:
#   mean +/- z * s / sqrt(n),  with z = qnorm(0.975) (2.5% in each tail).
# Mean, standard deviation and sample size are computed from the data rather
# than typed in, so no rounding error or miscounted n can creep in.
# (qt(0.975, df = n - 1) instead of qnorm(0.975) would give the slightly
# wider t-based interval.)

#To obtain the CI for InfctRsk:
a <- mean(infection$InfctRsk)      # sample mean
s <- sd(infection$InfctRsk)        # sample standard deviation
n <- length(infection$InfctRsk)    # sample size: one value per hospital (113)
error <- qnorm(0.975) * s / sqrt(n)
left95 <- a - error
right95 <- a + error
print(c(left95, right95))
# From the statistics reported above (mean 4.355, sd 1.340908, n = 113) this
# is roughly (4.11, 4.60).

#To obtain the CI for Nurses:
a <- mean(infection$Nurses)
s <- sd(infection$Nurses)
n <- length(infection$Nurses)
error <- qnorm(0.975) * s / sqrt(n)
leftn <- a - error
rightn <- a + error
print(c(leftn, rightn))
# From the statistics reported above (mean 173.2, sd 139.2654, n = 113) this
# is roughly (147.5, 198.9).

#7. Interpret the confidence intervals (CI). What do these CIs mean? (bonus 10)

#Interpretation for InfctRsk CI: 
#I am 95% confident that the population mean for InfctRsk is between approximately 4.1 and 4.6. In other words, if we repeated the sampling over and over and built a 95% confidence interval from each sample, about 95% of those intervals would contain the true population mean. 

#Interpretation for Nurses CI: I am 95% confident that the population mean for Nurses is between approximately 147 and 199. In other words, if we repeated the sampling over and over and built a 95% confidence interval from each sample, about 95% of those intervals would contain the true population mean. 

# When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file). The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.