library(readr)
library(haven)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(ipumsr)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Load the infection-risk data set from the working directory. readr echoes the
# column specification it guessed; every column is parsed as a double.
Infection_Risk_1_ <- read_csv("Infection_Risk(1).csv")
## Parsed with column specification:
## cols(
##   ID = col_double(),
##   Stay = col_double(),
##   Age = col_double(),
##   InfctRsk = col_double(),
##   Cultures = col_double(),
##   Xrays = col_double(),
##   Beds = col_double(),
##   MedSchl = col_double(),
##   Region = col_double(),
##   Census = col_double(),
##   Nurses = col_double(),
##   Services = col_double()
## )
# View() opens a data viewer, which only makes sense in an interactive session;
# the guard keeps the script runnable with Rscript or when knitting.
if (interactive()) {
  View(Infection_Risk_1_)
}

#1. Conduct a descriptive analysis for the two variables and show the following statistics: mean, standard deviation, minimum, and maximum.

# Pull the two variables of interest out of the data set once.
infct_risk <- Infection_Risk_1_[["InfctRsk"]]
nurses <- Infection_Risk_1_[["Nurses"]]

# summary() reports min, quartiles, mean and max (no standard deviation).
summary(infct_risk)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.300   3.700   4.400   4.355   5.200   7.800
summary(nurses)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0    66.0   132.0   173.2   218.0   656.0
# psych::describe() adds n, sd, skew and kurtosis to the mean/min/max.
describe(nurses)
##    vars   n   mean     sd median trimmed    mad min max range skew kurtosis
## X1    1 113 173.25 139.27    132  151.96 103.78  14 656   642 1.34     1.35
##      se
## X1 13.1
describe(infct_risk)
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 113 4.35 1.34    4.4    4.38 1.19 1.3 7.8   6.5 -0.12     0.07 0.13

#2. Produce boxplots for these two variables and interpret these boxplots

# Boxplot of infection risk, followed by its descriptive statistics so the
# plot can be read against the numeric summary.
boxplot(
  Infection_Risk_1_[["InfctRsk"]],
  main = "Infection Rate"
)

describe(Infection_Risk_1_[["InfctRsk"]])
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 113 4.35 1.34    4.4    4.38 1.19 1.3 7.8   6.5 -0.12     0.07 0.13
#Based on the boxplot, the median infection rate is 4.4, which is close to the mean of 4.35 and suggests a roughly symmetric (approximately normal) distribution. Since the 3rd Qu is 5.2 and the 1st Qu is 3.7, the IQR is 1.5. The boxplot flags 5 outlying hospitals: 3 with high infection rates and 2 with low infection rates.

#2…nurses boxplots

# Boxplot of the number of nurses, followed by the quartile summary used to
# read the box and whiskers.
boxplot(
  Infection_Risk_1_[["Nurses"]],
  main = "Nurses"
)

summary(Infection_Risk_1_[["Nurses"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    14.0    66.0   132.0   173.2   218.0   656.0
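#Based on the boxplot and summary, the median is 132 and the mean is 173.2; the mean exceeds the median, which indicates a right-skewed distribution. The IQR is 152 (218 - 66). There are 6 outliers: 4 hospitals have 400+ nurses and 2 others have 600+ nurses. The number of nurses varies widely across hospitals.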

#3. Do these two variables have a normal distribution? Why or why not?

#Based on the histograms below, infection rate is approximately normally distributed, but the number of nurses is not.
#The histogram for infection rate is roughly symmetrical around its mean (red line); the histogram for nurses is highly right-skewed.

# Histogram of infection risk with a red vertical line at the sample mean.
# Explicit main/xlab replace the default labels, which hist() builds from the
# deparsed argument expression, and match the titles used for the boxplots.
hist(
  Infection_Risk_1_$InfctRsk,
  main = "Infection Rate",
  xlab = "Infection risk"
)
abline(v = mean(Infection_Risk_1_$InfctRsk, na.rm = TRUE), col = "red")

# Histogram of the number of nurses with a red vertical line at the sample mean.
hist(
  Infection_Risk_1_$Nurses,
  main = "Nurses",
  xlab = "Number of nurses"
)
abline(v = mean(Infection_Risk_1_$Nurses, na.rm = TRUE), col = "red")

#4. Calculate the average infection risk for patients who are over 60 years old, and who are below 50 years old, respectively.

# Split the rows by Age and average InfctRsk within each subset.
# Age is a numeric column, so the thresholds must be numeric: comparing against
# the strings "60" / "50" coerces Age to character and compares lexically,
# which is only right by accident when every value has two integer digits.

# Rows with Age above 60.
dat1 <- Infection_Risk_1_ %>%
  filter(Age > 60)
dat1
## # A tibble: 6 x 12
##      ID  Stay   Age InfctRsk Cultures Xrays  Beds MedSchl Region Census Nurses
##   <dbl> <dbl> <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1    24  9.84  62.2     4.80    12     82.3   600       2      3    468    497
## 2    40  8.16  60.9     1.30     1.90  58      73       2      3     49     21
## 3    53 11.4   61.1     7.60    16.6   97.9   535       2      3    330    273
## 4    63  7.93  64.1     5.40     7.5   98.1    68       2      4     42     49
## 5   104 13.9   65.9     6.60    15.6  134.    356       2      1    308    182
## 6   106 10.8   63.9     2.90     1.60  57.4   130       2      3     69     62
## # … with 1 more variable: Services <dbl>

# Rows with Age below 50.
dat2 <- Infection_Risk_1_ %>%
  filter(Age < 50)
dat2
## # A tibble: 21 x 12
##       ID  Stay   Age InfctRsk Cultures Xrays  Beds MedSchl Region Census Nurses
##    <dbl> <dbl> <dbl>    <dbl>    <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
##  1     8 11.2   45.7     5.40    60.5   85.8   640       1      2    399    360
##  2     9  8.67  48.2     4.30    24.4   90.8   182       2      3    130    118
##  3    17  8.28  48.1     4.5     26    102.    108       2      4     84     73
##  4    21  7.53  42       4.20    23.1   98.9    95       2      4     47     49
##  5    22 10.2   49       4.80    36.3  113.    195       2      2    163    170
##  6    26  8.28  49.5     3.90    12    113.    546       1      2    413    436
##  7    27  9.31  47.2     4.5     30.2  101.    170       2      1    124    173
##  8    31 11.0   49.9     5       19.7  102.    318       2      1    270    335
##  9    38  7.84  49.1     4.60     7.10  87.9    60       2      3     50     45
## 10    43 11.2   45       3        7     78.9   130       2      3     95     56
## # … with 11 more rows, and 1 more variable: Services <dbl>

# Average infection risk in each age group: the figures the question asks for.
mean(dat1$InfctRsk, na.rm = TRUE)
mean(dat2$InfctRsk, na.rm = TRUE)

#5. What’s the average infection risk for each region? And What’s the average number of nurses for each region?

# Average infection risk for each region.
# Keep only the two columns needed and drop incomplete rows before averaging.
df3 <- subset(x = Infection_Risk_1_, select = c("InfctRsk", "Region"))
df3 <- na.omit(df3)
df3 %>%
  group_by(Region) %>%
  summarize(mean = mean(InfctRsk))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 4 x 2
##   Region  mean
##    <dbl> <dbl>
## 1      1  4.86
## 2      2  4.39
## 3      3  3.93
## 4      4  4.38

# Average number of nurses for each region (second half of the question).
# Rows missing either value are excluded, mirroring the na.omit() step above.
Infection_Risk_1_ %>%
  filter(!is.na(Nurses), !is.na(Region)) %>%
  group_by(Region) %>%
  summarize(mean_nurses = mean(Nurses))

#6. Now, I’d like to make inferences about the population. What’s the confidence interval for InfctRsk and Nurses, respectively? Show your calculation.

# Normal-approximation (z) confidence interval for a mean, computed from
# summary statistics.
#   center: sample mean
#   spread: sample standard deviation (must be >= 0)
#   size:   number of observations (must be > 0)
#   level:  confidence level, strictly between 0 and 1 (default 0.95)
# Returns c(lower, upper) = center -/+ z * spread / sqrt(size).
z_confidence_interval <- function(center, spread, size, level = 0.95) {
  stopifnot(
    is.numeric(center), is.numeric(spread), is.numeric(size),
    spread >= 0, size > 0, level > 0, level < 1
  )
  # Two-sided interval: (1 - level) / 2 of the probability sits in each tail.
  z <- qnorm((1 + level) / 2)
  half_width <- z * spread / sqrt(size)
  c(center - half_width, center + half_width)
}

# The means and standard deviations passed below are the two-decimal values
# reported by describe() earlier in this script; n is the reported sample size.
n <- 113

#CI for InfctRsk
infct_ci <- z_confidence_interval(4.35, 1.34, n)
left <- infct_ci[1]
right <- infct_ci[2]
print(c(left, right))
## [1] 4.102934 4.597066

#6 ….CI For Nurses

#CI for Nurses
nurses_ci <- z_confidence_interval(173.25, 139.27, n)
leftn <- nurses_ci[1]
rightn <- nurses_ci[2]
print(c(leftn, rightn))
## [1] 147.5717 198.9283

#7. Interpret the confidence intervals (CI). What do these CIs mean?

  #CI for infection rate: we are 95% confident that the population mean infection risk is between 4.10 and 4.60.
  #CI for nurses: we are 95% confident that the population mean number of nurses per hospital is between 147.57 and 198.93.