library(readr)
library(haven)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(haven)
library(knitr)
# Load the mortality dataset from the Stata file "PA_Mortality.dta"
# (read_dta() comes from haven).
PA_Mort <- read_dta("PA_Mortality.dta")
# View() opens an interactive data viewer. Only call it in an interactive
# session so that knitting or running the script with Rscript does not
# fail or block on it.
if (interactive()) {
  View(PA_Mort)
}
#6 a) Generate a boxplot of poverty rate at the county level (2 points). Based on the boxplot, what is the median poverty rate and the interquartile range (IQR) of the poverty rate? (2 points) What’s the minimum and maximum values for the poverty rate? (4 points)
# Box-and-whisker plot of the poverty rate column, followed by its
# numeric summary (min, quartiles, median, mean, max).
with(PA_Mort, boxplot(povrate, main = "Poverty Rate at County Level"))
summary(PA_Mort[["povrate"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.04873 0.09667 0.12455 0.12110 0.14199 0.24159
#The median poverty rate is 0.12455 (about 0.12). The IQR is 0.04532 (3rd quartile 0.14199 minus 1st quartile 0.09667). The minimum value for poverty rate is 0.04873; the maximum value for poverty rate is 0.24159.
#6 b) Is the distribution of poverty rate normally distributed? Why or why not? Describe how you reach to your conclusion. (4 points)
# Histogram of the poverty rate. The argument expression is kept as-is
# because hist() builds its default title and x-axis label from it.
hist(PA_Mort$povrate)
# Overlay a thick green vertical line at the mean poverty rate.
with(PA_Mort, abline(v = mean(povrate), col = "green", lwd = 4))
#The distribution of poverty rate appears to be approximately normally distributed. Based on the boxplot and histogram, the mean (0.121, marked by the green vertical line) and the median (0.125) are nearly equal, which suggests little skew and is consistent with a normal distribution.
#6 c) Please create two binary variables based on avemort and gini. For the former, please recode those less than or equal to 8 as “Low Mortality”, otherwise “High Mortality.” For the latter, those less than or equal to 0.4 should be coded as “Equal”, otherwise, “Unequal.” (8 points)
#note to self: create a subset to isolate the two variables, so the recodes
#below do not modify PA_Mort itself
mortgin <- subset(PA_Mort, select = c(avemort, gini))
#now we can create the binary variables
# avemort: less than or equal to 8 is "Low Mortality", otherwise "High Mortality"
mortgin$avemort <- ifelse(PA_Mort$avemort <= 8, "Low Mortality", "High Mortality")
table(mortgin$avemort)
##
## High Mortality Low Mortality
## 52 15
# gini: less than or equal to 0.4 is "Equal", otherwise "Unequal"
mortgin$gini <- ifelse(PA_Mort$gini <= 0.4, "Equal", "Unequal")
table(mortgin$gini)
##
## Equal Unequal
## 11 56
#6 d) How many counties have high mortality? And how many counties have “unequal” gini coefficient? (8 points)
# Frequency counts of the two recoded variables answer both questions.
table(mortgin$avemort)
##
## High Mortality Low Mortality
## 52 15
table(mortgin$gini)
##
## Equal Unequal
## 11 56
#52 counties have high mortality; 56 counties have an "unequal" gini coefficient (gini greater than 0.4).
#6 e) Show the confidence intervals for gini coefficients when county mortality level is low and high, respectively.
# Work on a plain data frame holding just the two variables needed here.
mortgin2 <- subset(PA_Mort, select = c(avemort, gini))
giniii <- data.frame(mortgin2)
# For each mortality group (avemort <= 8 is low mortality, otherwise high),
# compute the group size, mean and SD of gini, then a 95% confidence
# interval for the mean: mean +/- z * sd / sqrt(n).
# A two-sided 95% interval leaves 2.5% in each tail, so the critical value
# is qnorm(0.975); qnorm(0.95) would produce a 90% interval instead.
# NOTE(review): with only 15 counties in the low-mortality group a t-based
# interval, qt(0.975, n_counties - 1), would be slightly wider; the normal
# critical value is kept to follow the course notes.
gini_ci <- giniii %>%
  group_by(low_mortality = avemort <= 8) %>%
  summarise(
    n_counties = n(),
    mean_gini = mean(gini),
    sd_gini = sd(gini),
    .groups = "drop"
  ) %>%
  mutate(
    error = qnorm(0.975) * sd_gini / sqrt(n_counties),
    lower = mean_gini - error,
    upper = mean_gini + error
  )
gini_ci
# Expected result, using the group summaries
# (high mortality: n = 52, mean 0.4200577, sd 0.02342817;
#  low mortality:  n = 15, mean 0.4218000, sd 0.02341612):
#   high mortality (low_mortality == FALSE): approximately 0.414 to 0.426
#   low mortality  (low_mortality == TRUE):  approximately 0.410 to 0.434
#i. Based on the data the confidence intervals overlap.
#ii. We are 95% confident that the true population mean gini coefficient lies between about 0.410 and 0.434 for counties with low mortality, and between about 0.414 and 0.426 for counties with high mortality.
#iii. Because the confidence intervals overlap, there is no evidence of a significant difference in mean gini coefficient between high and low mortality counties.
#6 f) Knit your R markdown file and provide your Rpubs link. (bonus 10 points)
#bet