#Load Libraries
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
library(lme4)
## Loading required package: Matrix
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
##
## src, summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ipumsr)
library(readr)
library(psych)
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(haven)
Q1 What can we conclude from the summary statistics?
prb<-read_csv("C:/Users/codar/OneDrive/Documents/Stats 1/Data/PRB2013.csv", col_names=T)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
names(prb)<-tolower(names(prb))
prb_new<-prb%>%
mutate(Africa=ifelse(prb$continent=="Africa",yes= "Africa",no= "Not Africa"))
prb_new%>%
group_by(Africa)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
t.test(tfr~Africa, data=prb_new)
Q1. What can we conclude from the summary statistics
#The mean fertility rate for Africa is 4.61 with a standard deviation of 1.42, while the mean fertility rate for the rest of the world is 2.25 with a standard deviation of 0.889. There is more variation in the data for Africa (sd=1.42) versus the variation in the data for other countries (sd=0.889). Women in African countries, on average, have a higher fertility rate than those in non-African countries.
Q2. What can we conclude from the significant t-test?
#Our t-statistic of -11.559 does not fall within the rejection zone determined by a critical value of 2.0. I obtained the critical value by looking at the t-table and used the degrees of freedom from the results of the t-test, in addition to a .05 confidence level. In this example, we fail to reject the null hypothesis, which states that there is no difference in means between the two groups. The alternative hypothesis is that there is a difference in means between the true groups. In addition, according to the results of the t-test, we can be 95% confident that the true mean of the two groups is within -0.1595010 and 0.01246549.
Q3. Now that you see an example, now your turn. Please conduct a significant test to examine the difference in infant mortality (imr) between Asian countries and African countries. a) provide summary statistics by group
prb_3a <- prb %>%
transmute(
continent=ifelse(continent=="Africa", "Africa" ,
ifelse(continent=="Asia", "Asia", NA)),
imr=imr
)
View(prb_3a)
prb_3a <- na.omit(prb_3a)
prb_3a %>%
group_by(continent)%>%
summarize(mean=mean(imr,na.rm=TRUE), sd=sd(imr, na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## continent mean sd n
## <chr> <dbl> <dbl> <int>
## 1 Africa 57.8 28.4 55
## 2 Asia 24.0 20.3 51
#to run a t.test
t.test(imr~continent, data=prb_3a)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 24.38592 43.31119
## sample estimates:
## mean in group Africa mean in group Asia
## 57.83091 23.98235
#Interpret Results
#We can be 95% confident that the true difference in means will fall between 24.38592 and 43.3119
#This gives me all the continents
boxplot(imr~continent, data=prb_new)
#This gives me a true/false
boxplot(imr~continent=="Asia", imr~continent=="Africa", data=prb_new)
#Same as above
boxplot(imr~continent=="Asia", data = prb_new)
#Got it here using the ggplot package
e <- ggplot(prb_new, aes(x = continent, y = imr))
e + geom_boxplot() +
scale_x_discrete(limits=c("Africa", "Asia"))
## Warning: Removed 102 rows containing missing values (stat_boxplot).
t.test(imr~continent, data=prb_3a)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 24.38592 43.31119
## sample estimates:
## mean in group Africa mean in group Asia
## 57.83091 23.98235
Q6. How many groups are there? Describe the table briefly.
prb_new%>%
group_by(continent)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## continent means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Asia 2.52 1.03 51
## 3 Europe 1.55 0.228 45
## 4 North America 2.21 0.546 27
## 5 Oceania 3.18 0.901 17
## 6 South America 2.5 0.476 13
#There are six groups.The table provides the means, standard deviations, and the number of observations per group.
Q7. Conduct the Anova test, and explain how did we reach to the F-value of 57.379.
c1 <- aov(tfr~continent, data=prb_new)
anova(c1)
## Analysis of Variance Table
##
## Response: tfr
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 266.61 53.322 57.379 < 2.2e-16 ***
## Residuals 202 187.72 0.929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#To obtain the F-Statistic we divide the between-groups MS by the within-groups MS. In this case, we divide 53.322 by 0.929 to get the F-Statistic of 57.379.
Q8. Interpret the F-test (ANOVA test) results. Make sure you state the null and research hypotheses.
#Null Hypothesis: There is no difference in fertility rates between the continents
#Alternative Hypothesis: There is a difference in fertility rates between the continents
#The ANOVA results indicate that the p-value of <2.2e-16 is smaller than the 0.5, which means that we can reject the null hypothesis which states that there is no difference in fertility rates between the continent groups.
#Additionally, the F value of 53.322 is much larger than the critical f value I obtained from looking at the F distribution table. This means that the bigger values of F means that the between-groups variation is large, relative to the within-groups variation. This gives us more evidence to reject the null hypothesis.
Q9. Using PSID data to examine the relationship between education (educ) and family income (adjfinc). What’s the mean value for education and family income, respectively?
psid <- read_dta("~/Stats 1/Data/stata_PSID_w1.dta")
View(psid)
mean(psid$educ, na.rm = T)
## [1] 13.03642
mean(psid$adjfinc, na.rm=T)
## [1] 60.39013
Q10. Estimate the relationship between education (X) and family income (Y). 1) How would you write the linear regression equation?
re1 <- lm(adjfinc~educ, data=psid)
summary(re1)
##
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -982.3 -31.8 -11.7 14.8 4962.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.7707 0.9331 -39.41 <2e-16 ***
## educ 7.4568 0.0696 107.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.13 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.0818, Adjusted R-squared: 0.0818
## F-statistic: 1.148e+04 on 1 and 128843 DF, p-value: < 2.2e-16
#The linear regression equation would be written as follows:
#y=-36.7707+7.4568x+e
#No.
re1 <- lm(adjfinc~educ, data=psid)
summary(re1)
##
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -982.3 -31.8 -11.7 14.8 4962.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.7707 0.9331 -39.41 <2e-16 ***
## educ 7.4568 0.0696 107.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.13 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.0818, Adjusted R-squared: 0.0818
## F-statistic: 1.148e+04 on 1 and 128843 DF, p-value: < 2.2e-16
with(psid, plot(educ, adjfinc))
abline(re1, col="red")
#Coefficient implies that there is a positive linear relationship between educ and ajdinc. The coefficient of education is 7.4568, which means that for every unit increase in education, the adjfinc increases by $7,457.
anova(re1)
## Analysis of Variance Table
##
## Response: adjfinc
## Df Sum Sq Mean Sq F value Pr(>F)
## educ 1 70070052 70070052 11479 < 2.2e-16 ***
## Residuals 128843 786481569 6104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#786481569 = SSE. The sum of square error is the sum of the squared differences between the predicted and the observed values.It is a measure of the total variances between the predicted and observed values.
EXTRA WORK: PLOTTING RESIDUALS | Fitness Testing
#Created a residuals object then plotted residuals
re1.res = resid(re1)
plot(re1.res)
#Plotted residuals against observations
plot(fitted(re1),re1.res)
abline(re1, col="red")
#A QQ Plot
qqnorm(re1.res)
qqline(re1.res)
#DEnsity PLot
plot(density(re1.res))
hist(resid(re1))
boxplot(residuals(re1))
Q12. (Bonus). Estimate the relationship between education levels (more than high school>12 years versus equal or less than high school<=12) and family income (Y). 1) How would you write the equation? 2) What’s the R output of the regression analysis? 3) How would you interpret the coefficient of education? 4) How would you interpret the intercept?
#First mutate
eduin <- psid %>%
mutate (educ1= ifelse(educ<=8,
c("lhs"), c("mhs")))
#1. y=29.769 + 32.326x
#2. see output from lm below.
r2 <- lm(adjfinc~educ1, data=eduin)
summary(r2)
##
## Call:
## lm(formula = adjfinc ~ educ1, data = eduin)
##
## Residuals:
## Min 1Q Median 3Q Max
## -991.7 -36.0 -14.2 14.6 4982.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.769 1.001 29.75 <2e-16 ***
## educ1mhs 32.326 1.027 31.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.22 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.007627, Adjusted R-squared: 0.007619
## F-statistic: 990.3 on 1 and 128843 DF, p-value: < 2.2e-16
#3. The coefficient of education tells us that a person with more than high school education, on average, will earn about $62,095 on average or $32,326 more than a person with less than or equal to high school education.
#4.The intercept tells us that a person with less than or equal to a high school education, will earn, on average, $29,769.