##load libraries

library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.3
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(Hmisc)
## Warning: package 'Hmisc' was built under R version 4.0.3
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(lme4)
## Warning: package 'lme4' was built under R version 4.0.3
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

##load data

library(readr)
PRB2013 <- read_csv("PRB2013.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Continent = col_character(),
##   Region = col_character()
## )
## See spec(...) for full column specifications.
View(PRB2013)
prb<-read_csv("PRB2013.csv", col_names=T)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Continent = col_character(),
##   Region = col_character()
## )
## See spec(...) for full column specifications.
names(prb)<-tolower(names(prb))   
prb_new<-prb%>% #we are creating a new dataframe called prb_new
 mutate(Africa=ifelse(prb$continent=="Africa",yes= "Africa",no= "Not Africa")) # this new dataframe contains an additional dummy variable, Africa

#summary statistics by group using group_by
prb_new%>%
 group_by(Africa)%>%
 summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
##   Africa     means   sds     n
##   <chr>      <dbl> <dbl> <int>
## 1 Africa      4.61 1.42     55
## 2 Not Africa  2.25 0.889   153

##Q1. What can we conclude from the summary statistics?

##The means are different. With African countries having a mean of 4.6 and the non-African countries having a mean of 2.25. This means that woman in African countries have a higher fertility rate compared to women from non- African countries. There is a difference of 2.35 per woman on average

##Q2. What can we conclude from the significant t-test?

#significant t-test
t.test(tfr~Africa, data=prb_new)
## 
##  Welch Two Sample t-test
## 
## data:  tfr by Africa
## t = 11.559, df = 69.813, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.954226 2.769268
## sample estimates:
##     mean in group Africa mean in group Not Africa 
##                 4.612727                 2.250980
##Similar to the summary statistics, the significant t-test shows that Africa ha a higher mean compared to non-African countries. The t-test also shows that the means between African countries and non-African countries are significantly different from each other. This is seen by looking at the t-statistic which is 11.59 and the degree of freedom which is 69.8

##Q3. Now that you see an example, now your turn. Please conduct a significant test to examine the difference in infant mortality (imr) between Asian countries and African countries.

cont2 <- prb %>%
  transmute(
    continent=ifelse(continent=="Africa", "Africa" ,
                     ifelse(continent=="Asia", "Asia", NA)), 
    imr=imr
)
View(cont2)
cont2 <- na.omit(cont2)
cont2 %>% 
  group_by(continent)%>%
  summarise(mean=mean(imr, na.rm=TRUE), sd=sd(imr,na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
##   continent  mean    sd     n
##   <chr>     <dbl> <dbl> <int>
## 1 Africa     57.8  28.4    55
## 2 Asia       24.0  20.3    51
t.test(imr~continent, data=cont2)
## 
##  Welch Two Sample t-test
## 
## data:  imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  24.38592 43.31119
## sample estimates:
## mean in group Africa   mean in group Asia 
##             57.83091             23.98235

#a) provide summary statistics by group

#summary statistics by group using group_by


##Code above
cont2 %>% 
  group_by(continent)%>%
  summarise(mean=mean(imr, na.rm=TRUE), sd=sd(imr,na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
##   continent  mean    sd     n
##   <chr>     <dbl> <dbl> <int>
## 1 Africa     57.8  28.4    55
## 2 Asia       24.0  20.3    51

#b) provide boxplots by group

boxplot(imr~continent, data=cont2, main="continent")

#c) conduct t-test. Make sure you interpret the results thoroughly

#significant t-test
t.test(imr~continent, data=cont2)
## 
##  Welch Two Sample t-test
## 
## data:  imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  24.38592 43.31119
## sample estimates:
## mean in group Africa   mean in group Asia 
##             57.83091             23.98235
##Based on the t-test, we see that the means are different from each other with Africa having a mean of 57.83091 and Asia a mean of 24. This also shows that countries in Africa have a higher infant mortality rate compared to countries in Asia. The t-test shows that the means are significantly different from each other. This is determined by examining the t-statistic which is 7.0987 and the 97.84 degrees of freedom

###Moving from two groups to multiple groups #summary statistics by group

prb%>%
 group_by(continent)%>%
 summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   continent     means   sds     n
##   <chr>         <dbl> <dbl> <int>
## 1 Africa         4.61 1.42     55
## 2 Asia           2.52 1.03     51
## 3 Europe         1.55 0.228    45
## 4 North America  2.21 0.546    27
## 5 Oceania        3.18 0.901    17
## 6 South America  2.5  0.476    13

##Q6. Based on the output above, how many groups are there? Describe the table briefly.

##There are 6 groups.
##The table above shows the variance in means among the different continents.It shows that Africa has the highest fertility rate (4.6) and the Europe has the lowest fertility rates(1.5).There is a difference of 3 children per woman on average when comparing fertility rates in Africa and Europe.Therefore, the output shows that Africa has the overall highest fertility rates followed by Oceania,Asia,South America,North America and then Europe that has the lowest fertility rates.

##Q7. Conduct the Anova test, and explain how did we reach to the F-value of 57.379.

m1<-aov(formula=tfr~continent, data=prb)
anova(m1)
## Analysis of Variance Table
## 
## Response: tfr
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## continent   5 266.61  53.322  57.379 < 2.2e-16 ***
## Residuals 202 187.72   0.929                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##We reach the F-value of 57.379 by dividing the variation between sample means by the variation within sample means

##Q8. Interpret the F-test (ANOVA test) results. Make sure you state the null and research hypotheses.

##Null hypothesis: there is no significant difference


##Research: At least one of the means is significantly different

##The results shows that there is at least some difference across the means of the different continents

###Simple Regression Analysis ##Using PSID data to examine the relationship between education (educ) and family income (adjfinc).

##Q9. What’s the mean value for education and family income, respectively?

library(haven)
PSID <- read_dta("stata_PSID_w1.dta")
View(PSID)

summary(PSID$educ)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00   12.00   12.00   13.04   15.00   20.00    2496
summary(PSID$adjfinc)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -929.60   24.04   45.18   60.39   75.31 5044.84      48
##The mean value for educ is 13.04 and the adjfinc mean value is 60.39

##Q10. Estimate the relationship between education (X) and family income (Y).

Reg1<- lm(adjfinc~educ, data= PSID)
summary(Reg1)
## 
## Call:
## lm(formula = adjfinc ~ educ, data = PSID)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -982.3  -31.8  -11.7   14.8 4962.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.7707     0.9331  -39.41   <2e-16 ***
## educ          7.4568     0.0696  107.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.13 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.0818, Adjusted R-squared:  0.0818 
## F-statistic: 1.148e+04 on 1 and 128843 DF,  p-value: < 2.2e-16
with(Reg1,plot(PSID$educ,PSID$adjfinc))
abline(Reg1,col="pink")

#1) How would you write the linear regression equation?

## y= a+b*e

## y=-36.77+7.45*e
##y = -36.77+7.45

#2) Do you have any concerns that this model violates the regression assumptions?

##NO
##On the contrary it does not violet the regression assumptions but checks the boxes for the regression assumption

#3) What’s the R output of the regression analysis?

Reg1<- lm(adjfinc~educ, data= PSID)
summary(Reg1)
## 
## Call:
## lm(formula = adjfinc ~ educ, data = PSID)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -982.3  -31.8  -11.7   14.8 4962.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.7707     0.9331  -39.41   <2e-16 ***
## educ          7.4568     0.0696  107.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.13 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.0818, Adjusted R-squared:  0.0818 
## F-statistic: 1.148e+04 on 1 and 128843 DF,  p-value: < 2.2e-16

#4) How would you interpret the coefficient of education?

#for every value of increase in education, there is an expected increase in family income by $7,4568 

#5) Show the analysis of variance table from this regression analysis.

anova(Reg1)
## Analysis of Variance Table
## 
## Response: adjfinc
##               Df    Sum Sq  Mean Sq F value    Pr(>F)    
## educ           1  70070052 70070052   11479 < 2.2e-16 ***
## Residuals 128843 786481569     6104                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#6) What’s the value of SSE? What does it mean?

##The SSE is 786481569 . The Sum of Square Error is the difference between the observed value and the predicted value. This means that there is a 786481569 difference between the observed value and the predicted value.

Q12. (Bonus). Estimate the relationship between education levels (more than high school>12 years versus equal or less than high school<=12) and family income (Y).

#1) How would you write the equation?

##y=  29.769+32.326xe

## y = 29.769+32.326

educ2<- PSID %>%
 mutate (educc= ifelse(educ<= 8,
                               c("less highschool"), c("more highchool")))
summary(educ2)
##       year          sex                 age             marpi       
##  Min.   :2001   Length:131361      Min.   :  1.00   Min.   :0.0000  
##  1st Qu.:2003   Class :character   1st Qu.: 14.00   1st Qu.:0.0000  
##  Median :2007   Mode  :character   Median : 29.00   Median :0.0000  
##  Mean   :2006                      Mean   : 32.03   Mean   :0.4178  
##  3rd Qu.:2009                      3rd Qu.: 47.00   3rd Qu.:1.0000  
##  Max.   :2011                      Max.   :999.00   Max.   :4.0000  
##                                                     NA's   :28      
##       educ          adjfinc            pubhs             rnthlp       
##  Min.   : 0.00   Min.   :-929.60   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.:12.00   1st Qu.:  24.04   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median :12.00   Median :  45.18   Median :0.00000   Median :0.00000  
##  Mean   :13.04   Mean   :  60.39   Mean   :0.05301   Mean   :0.02409  
##  3rd Qu.:15.00   3rd Qu.:  75.31   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :20.00   Max.   :5044.84   Max.   :1.00000   Max.   :1.00000  
##  NA's   :2496    NA's   :48        NA's   :34        NA's   :48       
##     adjwlth1           adjwlth2        h_race_ethnic_new        id         
##  Min.   :-2467.18   Min.   :-2304.98   Length:131361      Min.   :   4003  
##  1st Qu.:    0.01   1st Qu.:    1.91   Class :character   1st Qu.:1269033  
##  Median :    9.98   Median :   32.80   Mode  :character   Median :2464171  
##  Mean   :  129.48   Mean   :  187.17                      Mean   :3014466  
##  3rd Qu.:   58.05   3rd Qu.:  143.55                      3rd Qu.:5381175  
##  Max.   :80199.41   Max.   :80303.23                      Max.   :6872185  
##  NA's   :48         NA's   :48                                             
##      race5          educc          
##  Min.   :1.000   Length:131361     
##  1st Qu.:3.000   Class :character  
##  Median :5.000   Mode  :character  
##  Mean   :3.927                     
##  3rd Qu.:5.000                     
##  Max.   :5.000                     
## 
table(educ2$educc)
## 
## less highschool  more highchool 
##            6594          122271
reg2<- lm(adjfinc~educc, data=educ2)
summary(reg2)
## 
## Call:
## lm(formula = adjfinc ~ educc, data = educ2)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -991.7  -36.0  -14.2   14.6 4982.7 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           29.769      1.001   29.75   <2e-16 ***
## educcmore highchool   32.326      1.027   31.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.22 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.007627,   Adjusted R-squared:  0.007619 
## F-statistic: 990.3 on 1 and 128843 DF,  p-value: < 2.2e-16

#2) What’s the R output of the regression analysis?

##see question 1

#3) How would you interpret the coefficient of education?

##On average, people with more than highschool education enjoy $32,326 more in family income than their counterparts with less than highschool education
##Or
##People with more highschool education are expected to earn 32.326 more in family income than those with less highschool education

##4 How would you interpret the intercept?

##On average, people with more highschool education are expected to enjoy an increase in family income by 29.7 compared to their counterparts with less highschool

##Or

##On average people with less than highschool education are expected to earn 29,769 in family income compared to their counterparts with more family income.

##Or

##People with less than highschool are expected to earn 29.7 less family income than their counterparts with more highschool