##load libraries
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v stringr 1.4.0
## v tidyr 1.1.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
library(lme4)
## Warning: package 'lme4' was built under R version 4.0.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##load data
library(readr)
PRB2013 <- read_csv("PRB2013.csv")
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
View(PRB2013)
prb<-read_csv("PRB2013.csv", col_names=T)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
names(prb)<-tolower(names(prb))
prb_new<-prb%>% #we are creating a new dataframe called prb_new
mutate(Africa=ifelse(prb$continent=="Africa",yes= "Africa",no= "Not Africa")) # this new dataframe contains an additional dummy variable, Africa
#summary statistics by group using group_by
prb_new%>%
group_by(Africa)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## Africa means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Not Africa 2.25 0.889 153
##Q1. What can we conclude from the summary statistics?
##The means are different. With African countries having a mean of 4.6 and the non-African countries having a mean of 2.25. This means that woman in African countries have a higher fertility rate compared to women from non- African countries. There is a difference of 2.35 per woman on average
##Q2. What can we conclude from the significant t-test?
#significant t-test
t.test(tfr~Africa, data=prb_new)
##
## Welch Two Sample t-test
##
## data: tfr by Africa
## t = 11.559, df = 69.813, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.954226 2.769268
## sample estimates:
## mean in group Africa mean in group Not Africa
## 4.612727 2.250980
##Similar to the summary statistics, the significant t-test shows that Africa ha a higher mean compared to non-African countries. The t-test also shows that the means between African countries and non-African countries are significantly different from each other. This is seen by looking at the t-statistic which is 11.59 and the degree of freedom which is 69.8
##Q3. Now that you see an example, now your turn. Please conduct a significant test to examine the difference in infant mortality (imr) between Asian countries and African countries.
cont2 <- prb %>%
transmute(
continent=ifelse(continent=="Africa", "Africa" ,
ifelse(continent=="Asia", "Asia", NA)),
imr=imr
)
View(cont2)
cont2 <- na.omit(cont2)
cont2 %>%
group_by(continent)%>%
summarise(mean=mean(imr, na.rm=TRUE), sd=sd(imr,na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## continent mean sd n
## <chr> <dbl> <dbl> <int>
## 1 Africa 57.8 28.4 55
## 2 Asia 24.0 20.3 51
t.test(imr~continent, data=cont2)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 24.38592 43.31119
## sample estimates:
## mean in group Africa mean in group Asia
## 57.83091 23.98235
#a) provide summary statistics by group
#summary statistics by group using group_by
##Code above
cont2 %>%
group_by(continent)%>%
summarise(mean=mean(imr, na.rm=TRUE), sd=sd(imr,na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## continent mean sd n
## <chr> <dbl> <dbl> <int>
## 1 Africa 57.8 28.4 55
## 2 Asia 24.0 20.3 51
#b) provide boxplots by group
boxplot(imr~continent, data=cont2, main="continent")
#c) conduct t-test. Make sure you interpret the results thoroughly
#significant t-test
t.test(imr~continent, data=cont2)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 24.38592 43.31119
## sample estimates:
## mean in group Africa mean in group Asia
## 57.83091 23.98235
##Based on the t-test, we see that the means are different from each other with Africa having a mean of 57.83091 and Asia a mean of 24. This also shows that countries in Africa have a higher infant mortality rate compared to countries in Asia. The t-test shows that the means are significantly different from each other. This is determined by examining the t-statistic which is 7.0987 and the 97.84 degrees of freedom
###Moving from two groups to multiple groups #summary statistics by group
prb%>%
group_by(continent)%>%
summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## continent means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Asia 2.52 1.03 51
## 3 Europe 1.55 0.228 45
## 4 North America 2.21 0.546 27
## 5 Oceania 3.18 0.901 17
## 6 South America 2.5 0.476 13
##Q6. Based on the output above, how many groups are there? Describe the table briefly.
##There are 6 groups.
##The table above shows the variance in means among the different continents.It shows that Africa has the highest fertility rate (4.6) and the Europe has the lowest fertility rates(1.5).There is a difference of 3 children per woman on average when comparing fertility rates in Africa and Europe.Therefore, the output shows that Africa has the overall highest fertility rates followed by Oceania,Asia,South America,North America and then Europe that has the lowest fertility rates.
##Q7. Conduct the Anova test, and explain how did we reach to the F-value of 57.379.
m1<-aov(formula=tfr~continent, data=prb)
anova(m1)
## Analysis of Variance Table
##
## Response: tfr
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 266.61 53.322 57.379 < 2.2e-16 ***
## Residuals 202 187.72 0.929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##We reach the F-value of 57.379 by dividing the variation between sample means by the variation within sample means
##Q8. Interpret the F-test (ANOVA test) results. Make sure you state the null and research hypotheses.
##Null hypothesis: there is no significant difference
##Research: At least one of the means is significantly different
##The results shows that there is at least some difference across the means of the different continents
###Simple Regression Analysis ##Using PSID data to examine the relationship between education (educ) and family income (adjfinc).
##Q9. What’s the mean value for education and family income, respectively?
library(haven)
PSID <- read_dta("stata_PSID_w1.dta")
View(PSID)
summary(PSID$educ)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 12.00 12.00 13.04 15.00 20.00 2496
summary(PSID$adjfinc)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -929.60 24.04 45.18 60.39 75.31 5044.84 48
##The mean value for educ is 13.04 and the adjfinc mean value is 60.39
##Q10. Estimate the relationship between education (X) and family income (Y).
Reg1<- lm(adjfinc~educ, data= PSID)
summary(Reg1)
##
## Call:
## lm(formula = adjfinc ~ educ, data = PSID)
##
## Residuals:
## Min 1Q Median 3Q Max
## -982.3 -31.8 -11.7 14.8 4962.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.7707 0.9331 -39.41 <2e-16 ***
## educ 7.4568 0.0696 107.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.13 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.0818, Adjusted R-squared: 0.0818
## F-statistic: 1.148e+04 on 1 and 128843 DF, p-value: < 2.2e-16
with(Reg1,plot(PSID$educ,PSID$adjfinc))
abline(Reg1,col="pink")
#1) How would you write the linear regression equation?
## y= a+b*e
## y=-36.77+7.45*e
##y = -36.77+7.45
#2) Do you have any concerns that this model violates the regression assumptions?
##NO
##On the contrary it does not violet the regression assumptions but checks the boxes for the regression assumption
#3) What’s the R output of the regression analysis?
Reg1<- lm(adjfinc~educ, data= PSID)
summary(Reg1)
##
## Call:
## lm(formula = adjfinc ~ educ, data = PSID)
##
## Residuals:
## Min 1Q Median 3Q Max
## -982.3 -31.8 -11.7 14.8 4962.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -36.7707 0.9331 -39.41 <2e-16 ***
## educ 7.4568 0.0696 107.14 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 78.13 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.0818, Adjusted R-squared: 0.0818
## F-statistic: 1.148e+04 on 1 and 128843 DF, p-value: < 2.2e-16
#4) How would you interpret the coefficient of education?
#for every value of increase in education, there is an expected increase in family income by $7,4568
#5) Show the analysis of variance table from this regression analysis.
anova(Reg1)
## Analysis of Variance Table
##
## Response: adjfinc
## Df Sum Sq Mean Sq F value Pr(>F)
## educ 1 70070052 70070052 11479 < 2.2e-16 ***
## Residuals 128843 786481569 6104
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#6) What’s the value of SSE? What does it mean?
##The SSE is 786481569 . The Sum of Square Error is the difference between the observed value and the predicted value. This means that there is a 786481569 difference between the observed value and the predicted value.
#1) How would you write the equation?
##y= 29.769+32.326xe
## y = 29.769+32.326
educ2<- PSID %>%
mutate (educc= ifelse(educ<= 8,
c("less highschool"), c("more highchool")))
summary(educ2)
## year sex age marpi
## Min. :2001 Length:131361 Min. : 1.00 Min. :0.0000
## 1st Qu.:2003 Class :character 1st Qu.: 14.00 1st Qu.:0.0000
## Median :2007 Mode :character Median : 29.00 Median :0.0000
## Mean :2006 Mean : 32.03 Mean :0.4178
## 3rd Qu.:2009 3rd Qu.: 47.00 3rd Qu.:1.0000
## Max. :2011 Max. :999.00 Max. :4.0000
## NA's :28
## educ adjfinc pubhs rnthlp
## Min. : 0.00 Min. :-929.60 Min. :0.00000 Min. :0.00000
## 1st Qu.:12.00 1st Qu.: 24.04 1st Qu.:0.00000 1st Qu.:0.00000
## Median :12.00 Median : 45.18 Median :0.00000 Median :0.00000
## Mean :13.04 Mean : 60.39 Mean :0.05301 Mean :0.02409
## 3rd Qu.:15.00 3rd Qu.: 75.31 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :20.00 Max. :5044.84 Max. :1.00000 Max. :1.00000
## NA's :2496 NA's :48 NA's :34 NA's :48
## adjwlth1 adjwlth2 h_race_ethnic_new id
## Min. :-2467.18 Min. :-2304.98 Length:131361 Min. : 4003
## 1st Qu.: 0.01 1st Qu.: 1.91 Class :character 1st Qu.:1269033
## Median : 9.98 Median : 32.80 Mode :character Median :2464171
## Mean : 129.48 Mean : 187.17 Mean :3014466
## 3rd Qu.: 58.05 3rd Qu.: 143.55 3rd Qu.:5381175
## Max. :80199.41 Max. :80303.23 Max. :6872185
## NA's :48 NA's :48
## race5 educc
## Min. :1.000 Length:131361
## 1st Qu.:3.000 Class :character
## Median :5.000 Mode :character
## Mean :3.927
## 3rd Qu.:5.000
## Max. :5.000
##
table(educ2$educc)
##
## less highschool more highchool
## 6594 122271
reg2<- lm(adjfinc~educc, data=educ2)
summary(reg2)
##
## Call:
## lm(formula = adjfinc ~ educc, data = educ2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -991.7 -36.0 -14.2 14.6 4982.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 29.769 1.001 29.75 <2e-16 ***
## educcmore highchool 32.326 1.027 31.47 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 81.22 on 128843 degrees of freedom
## (2516 observations deleted due to missingness)
## Multiple R-squared: 0.007627, Adjusted R-squared: 0.007619
## F-statistic: 990.3 on 1 and 128843 DF, p-value: < 2.2e-16
#2) What’s the R output of the regression analysis?
##see question 1
#3) How would you interpret the coefficient of education?
##On average, people with more than highschool education enjoy $32,326 more in family income than their counterparts with less than highschool education
##Or
##People with more highschool education are expected to earn 32.326 more in family income than those with less highschool education
##4 How would you interpret the intercept?
##On average, people with more highschool education are expected to enjoy an increase in family income by 29.7 compared to their counterparts with less highschool
##Or
##On average people with less than highschool education are expected to earn 29,769 in family income compared to their counterparts with more family income.
##Or
##People with less than highschool are expected to earn 29.7 less family income than their counterparts with more highschool