Dem 7273 Due date is November 11, 2020 Monday by 5pm.
Instruction: Read the codes and questions carefully. Open a dataset called PRB that’s available under homework folder on blackboard. We would like to know whether total fertility rate (tfr) differs significantly between Africa and the rest of the world. Please type your answers, including my original code in a R-markdown file. When you submit, please submit the Rpub link.
library(ggplot2)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(broom)
prb <- read_csv("PRB2013.csv",col_names=T)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
names(prb)<- tolower(names(prb))
prb_new<-prb%>% #we are creating a new dataframe called prb_new
mutate(Africa=ifelse(prb$continent=="Africa",yes= "Africa",no= "Not Africa")) # this new dataframe contains an additional dummy variable, Africa
#summary statistics by group using group_by
prb_new%>%
group_by(Africa)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## Africa means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Not Africa 2.25 0.889 153
#significant t-test
t.test(tfr~Africa, data=prb_new)
##
## Welch Two Sample t-test
##
## data: tfr by Africa
## t = 11.559, df = 69.813, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.954226 2.769268
## sample estimates:
## mean in group Africa mean in group Not Africa
## 4.612727 2.250980
Q1. What can we conclude from the summary statistics?
The mean total fertility rates of countries not in Africa are lower are lower than those in Africa. African TFR’s also have more variance than non-African countries, illustrated by the higher standard deviation.
Q2. What can we conclude from the significant t-test?
The results of the t-test show a t-statistic score of 11.59 with a p-value of less than 0.05, these suggest a rejection in the null hypothesis and that the true difference in means is not zero.
Q3. Now that you see an example, now your turn. Please conduct a significant test to examine the difference in infant mortality (imr) between Asian countries and African countries.
prb_new%>%
group_by(continent)%>%
summarise(means=mean(imr, na.rm=T), sds=sd(imr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## continent means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 57.8 28.4 55
## 2 Asia 24.0 20.3 51
## 3 Europe 4.73 2.76 45
## 4 North America 15.5 11.0 27
## 5 Oceania 22.2 14.1 17
## 6 South America 18.7 9.67 13
prb1<- prb_new%>%
mutate(continent_id= ifelse(continent == "Africa", 1,
ifelse(continent == "Asia", 2,
ifelse(continent == "Europe", 3,
ifelse(continent == "North America", 4,
ifelse(continent == "Oceania", 5,
ifelse(continent == "South America", 6, NA)))))))%>%
filter(continent_id <= 2)
print(prb1)
## # A tibble: 106 x 35
## id country continent region year population cbr cdr rate_of_natural~
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 115 Afghan~ Asia South~ 2013 30.6 37 8 2.8
## 2 1 Algeria Africa NORTH~ 2013 38.3 26 5 2.2
## 3 43 Angola Africa MIDDL~ 2013 21.6 47 15 3.2
## 4 97 Armenia Asia Weste~ 2013 3 14 10 0.4
## 5 98 Azerba~ Asia Weste~ 2013 9.4 19 6 1.3
## 6 99 Bahrain Asia Weste~ 2013 1.1 14 2 1.2
## 7 116 Bangla~ Asia South~ 2013 157. 21 6 1.5
## 8 8 Benin Africa WESTE~ 2013 9.6 39 10 2.9
## 9 117 Bhutan Asia South~ 2013 0.7 22 7 1.5
## 10 52 Botswa~ Africa SOUTH~ 2013 1.9 24 17 0.7
## # ... with 96 more rows, and 26 more variables: net_migration_rate <dbl>,
## # projectedpopmid2025 <dbl>, projectedpopmid2050 <dbl>,
## # projectedpopchange_08_50perc <dbl>, imr <dbl>, tfr <dbl>,
## # percpoplt15 <dbl>, percpopgt65 <dbl>, e0total <dbl>, e0male <dbl>,
## # e0female <dbl>, percurban <dbl>, percpop1549hivaids1995 <dbl>,
## # percpop1549hivaids2011_2013 <dbl>, percmarwomcontraall <dbl>,
## # percmarwomcontramodern <dbl>, popsquarekm <dbl>,
## # gnippppercapitausdollars2012 <dbl>, gdpgrowth2000_2006 <dbl>,
## # gdpgrowth2007_2011 <dbl>, percshareofincome_poor5 <dbl>,
## # percshareofincome_wealthiest5 <dbl>, percpopimpropersanitation_urban <dbl>,
## # percpopimpropersanitation_rural <dbl>, Africa <chr>, continent_id <dbl>
attach(prb1)
boxplot(imr~continent)
t.test(imr~continent)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 24.38592 43.31119
## sample estimates:
## mean in group Africa mean in group Asia
## 57.83091 23.98235
#summary statistics by group
prb%>%
group_by(continent)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## continent means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Asia 2.52 1.03 51
## 3 Europe 1.55 0.228 45
## 4 North America 2.21 0.546 27
## 5 Oceania 3.18 0.901 17
## 6 South America 2.5 0.476 13
Q6. Based on the output above, how many groups are there? Describe the table briefly.
The table shows the mean tfr, standard deviation of tfr, and the number of countries within each of the six continents.
Q7. Conduct the Anova test, and explain how did we reach to the F-value of 57.379.
names(prb)<-tolower(names(prb))
fit<-lm(tfr~continent, data=prb)
anova(fit)
## Analysis of Variance Table
##
## Response: tfr
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 266.61 53.322 57.379 < 2.2e-16 ***
## Residuals 202 187.72 0.929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The F value was found by dividing the mean squares between groups by the mean squares within groups. 53.322/0.929
Q8. Interpret the F-test (ANOVA test) results. Make sure you state the null and research hypotheses.
null hypothesis : The difference in mean tfr between continents is equal to zero. research hypothesis : At least one continent will have a different mean tfr.
Given that the F-value is greater than the critical value, the null hypothesis would be rejected in favor of the alternative.
##Simple Regression Analysis
Using PSID data to examine the relationship between education (educ) and family income (adjfinc).
Q9. What’s the mean value for education and family income, respectively?
library(haven)
psid <- read_dta("psid2013.dta")
View(psid)
mean(psid$educ, na.rm = T)
## [1] 13.37999
mean(psid$adjfinc)
## [1] 54.36537
The mean value for years of education completed and family income are 13.4 and 54.4 respectively.
Q10. Estimate the relationship between education (X) and family income (Y).
fin<-lm(adjfinc~educ, data = psid)
summary(fin)
##
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.74 -28.94 -10.36 15.13 1751.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.1346 1.7674 -23.27 <2e-16 ***
## educ 7.1438 0.1285 55.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.07 on 22959 degrees of freedom
## (173 observations deleted due to missingness)
## Multiple R-squared: 0.1186, Adjusted R-squared: 0.1186
## F-statistic: 3091 on 1 and 22959 DF, p-value: < 2.2e-16
The linear regression equation can be represented as : Yi = alpha + Beta Xi + ei
In this case the equation can be written as: family income = -41.1 + 7.1(years of eudcation) + ei
plot(fin, which=1)
plot(fin)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(fin)
##
## studentized Breusch-Pagan test
##
## data: fin
## BP = 130.2, df = 1, p-value < 2.2e-16
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
ncvTest(fin)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 11515.8, Df = 1, p = < 2.22e-16
After running a few diagnostics, I’m concerned that at least the assumption of homoscedasticity, or constant variance of error, is not met.
summary(fin)
##
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.74 -28.94 -10.36 15.13 1751.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.1346 1.7674 -23.27 <2e-16 ***
## educ 7.1438 0.1285 55.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.07 on 22959 degrees of freedom
## (173 observations deleted due to missingness)
## Multiple R-squared: 0.1186, Adjusted R-squared: 0.1186
## F-statistic: 3091 on 1 and 22959 DF, p-value: < 2.2e-16
ggplot(psid, aes(x=educ, y=adjfinc))+geom_point()+geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 173 rows containing non-finite values (stat_smooth).
## Warning: Removed 173 rows containing missing values (geom_point).
coef(fin)
## (Intercept) educ
## -41.134633 7.143759
library(ggpubr)
Each additional year of education is associated with a 7,140 increase in family income.
anova(fin)
## Analysis of Variance Table
##
## Response: adjfinc
## Df Sum Sq Mean Sq F value Pr(>F)
## educ 1 11907473 11907473 3090.8 < 2.2e-16 ***
## Residuals 22959 88450801 3853
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
SSE is the residual sum of squares due to error. In this case the SSE is 88450801