#Load Libraries 
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(lme4)
## Loading required package: Matrix
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:Hmisc':
## 
##     src, summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ipumsr)
library(readr)
library(psych)
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(haven)

Q1 What can we conclude from the summary statistics?

prb<-read_csv("C:/Users/codar/OneDrive/Documents/Stats 1/Data/PRB2013.csv", col_names=T)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Continent = col_character(),
##   Region = col_character()
## )
## See spec(...) for full column specifications.
names(prb)<-tolower(names(prb))   
prb_new<-prb%>% 
 mutate(Africa=ifelse(prb$continent=="Africa",yes= "Africa",no= "Not Africa")) 
prb_new%>%
 group_by(Africa)%>%
 summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
t.test(tfr~Africa, data=prb_new)

Q1. What can we conclude from the summary statistics

#The mean fertility rate for Africa is 4.61 with a standard deviation of 1.42, while the mean fertility rate for the rest of the world is 2.25 with a standard deviation of 0.889. There is more variation in the data for Africa (sd=1.42) versus the variation in the data for other countries (sd=0.889). Women in African countries, on average, have a higher fertility rate than those in non-African countries.

Q2. What can we conclude from the significant t-test?

#Our t-statistic of -11.559 does not fall within the rejection zone determined by a critical value of 2.0. I obtained the critical value by looking at the t-table and used the degrees of freedom from the results of the t-test, in addition to a .05 confidence level. In this example, we fail to reject the null hypothesis, which states that there is no difference in means between the two groups. The alternative hypothesis is that there is a difference in means between the true groups. In addition, according to the results of the t-test, we can be 95% confident that the true mean of the two groups is within -0.1595010 and 0.01246549.

Q3. Now that you see an example, now your turn. Please conduct a significant test to examine the difference in infant mortality (imr) between Asian countries and African countries. a) provide summary statistics by group

prb_3a <- prb %>%
  transmute(
    continent=ifelse(continent=="Africa", "Africa" ,
                     ifelse(continent=="Asia", "Asia", NA)), 
    imr=imr
  )
View(prb_3a)
prb_3a <- na.omit(prb_3a)
prb_3a %>% 
  group_by(continent)%>%
  summarize(mean=mean(imr,na.rm=TRUE), sd=sd(imr, na.rm = TRUE), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
##   continent  mean    sd     n
##   <chr>     <dbl> <dbl> <int>
## 1 Africa     57.8  28.4    55
## 2 Asia       24.0  20.3    51
#to run a t.test
t.test(imr~continent, data=prb_3a)
## 
##  Welch Two Sample t-test
## 
## data:  imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  24.38592 43.31119
## sample estimates:
## mean in group Africa   mean in group Asia 
##             57.83091             23.98235
#Interpret Results 
#We can be 95% confident that the true difference in means will fall between 24.38592 and 43.3119
  1. provide boxplots by group
#This gives me all the continents
boxplot(imr~continent, data=prb_new)

#This gives me a true/false
boxplot(imr~continent=="Asia", imr~continent=="Africa", data=prb_new)
#Same as above
boxplot(imr~continent=="Asia", data = prb_new)

#Got it here using the ggplot package
e <- ggplot(prb_new, aes(x = continent, y = imr))
e + geom_boxplot() + 
  scale_x_discrete(limits=c("Africa", "Asia"))
## Warning: Removed 102 rows containing missing values (stat_boxplot).

  1. conduct t-test. Make sure you interpret the results thoroughly
t.test(imr~continent, data=prb_3a)
## 
##  Welch Two Sample t-test
## 
## data:  imr by continent
## t = 7.0987, df = 97.84, p-value = 2.02e-10
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  24.38592 43.31119
## sample estimates:
## mean in group Africa   mean in group Asia 
##             57.83091             23.98235

Q6. How many groups are there? Describe the table briefly.

prb_new%>%
 group_by(continent)%>%
 summarise(means=mean(tfr, na.rm=T), sds=sd(tfr, na.rm=T), n=n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   continent     means   sds     n
##   <chr>         <dbl> <dbl> <int>
## 1 Africa         4.61 1.42     55
## 2 Asia           2.52 1.03     51
## 3 Europe         1.55 0.228    45
## 4 North America  2.21 0.546    27
## 5 Oceania        3.18 0.901    17
## 6 South America  2.5  0.476    13
#There are six groups.The table provides the means, standard deviations, and the number of observations per group. 

Q7. Conduct the Anova test, and explain how did we reach to the F-value of 57.379.

c1 <- aov(tfr~continent, data=prb_new)
anova(c1)
## Analysis of Variance Table
## 
## Response: tfr
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## continent   5 266.61  53.322  57.379 < 2.2e-16 ***
## Residuals 202 187.72   0.929                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#To obtain the F-Statistic we divide the between-groups MS by the within-groups MS. In this case, we divide 53.322 by 0.929 to get the F-Statistic of 57.379. 

Q8. Interpret the F-test (ANOVA test) results. Make sure you state the null and research hypotheses.

#Null Hypothesis: There is no difference in fertility rates between the continents 
#Alternative Hypothesis: There is a difference in fertility rates between the continents 


#The ANOVA results indicate that the p-value of <2.2e-16 is smaller than the 0.5, which means that we can reject the null hypothesis which states that there is no difference in fertility rates between the continent groups. 

#Additionally, the F value of 53.322 is much larger than the critical f value I obtained from looking at the F distribution table. This means that the bigger values of F means that the between-groups variation is large, relative to the within-groups variation. This gives us more evidence to reject the null hypothesis. 

Q9. Using PSID data to examine the relationship between education (educ) and family income (adjfinc). What’s the mean value for education and family income, respectively?

psid <- read_dta("~/Stats 1/Data/stata_PSID_w1.dta")
View(psid)

mean(psid$educ, na.rm = T)
## [1] 13.03642
mean(psid$adjfinc, na.rm=T)
## [1] 60.39013

Q10. Estimate the relationship between education (X) and family income (Y). 1) How would you write the linear regression equation?

re1 <- lm(adjfinc~educ, data=psid)
summary(re1)
## 
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -982.3  -31.8  -11.7   14.8 4962.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.7707     0.9331  -39.41   <2e-16 ***
## educ          7.4568     0.0696  107.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.13 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.0818, Adjusted R-squared:  0.0818 
## F-statistic: 1.148e+04 on 1 and 128843 DF,  p-value: < 2.2e-16
#The linear regression equation would be written as follows:
#y=-36.7707+7.4568x+e
  1. Do you have any concerns that this model violates the regression assumptions?
#No. 
  1. What’s the R output of the regression analysis?
re1 <- lm(adjfinc~educ, data=psid)
summary(re1)
## 
## Call:
## lm(formula = adjfinc ~ educ, data = psid)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -982.3  -31.8  -11.7   14.8 4962.3 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -36.7707     0.9331  -39.41   <2e-16 ***
## educ          7.4568     0.0696  107.14   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 78.13 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.0818, Adjusted R-squared:  0.0818 
## F-statistic: 1.148e+04 on 1 and 128843 DF,  p-value: < 2.2e-16
with(psid, plot(educ, adjfinc))
abline(re1, col="red")

  1. How would you interpret the coefficient of education?
#Coefficient implies that there is a positive linear relationship between educ and ajdinc. The coefficient of education is 7.4568, which means that for every unit increase in education, the adjfinc increases by $7,457. 
  1. Show the analysis of variance table from this regression analysis.
anova(re1)
## Analysis of Variance Table
## 
## Response: adjfinc
##               Df    Sum Sq  Mean Sq F value    Pr(>F)    
## educ           1  70070052 70070052   11479 < 2.2e-16 ***
## Residuals 128843 786481569     6104                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  1. What’s the value of SSE? What does it mean?
#786481569 = SSE. The sum of square error is the sum of the squared differences between the predicted and the observed values.It is a measure of the total variances between the predicted and observed values. 

EXTRA WORK: PLOTTING RESIDUALS | Fitness Testing

#Created a residuals object then plotted residuals
re1.res = resid(re1)
plot(re1.res)

#Plotted residuals against observations
plot(fitted(re1),re1.res)
abline(re1, col="red")

#A QQ Plot
qqnorm(re1.res)
qqline(re1.res)

#DEnsity PLot 
plot(density(re1.res))

hist(resid(re1))

boxplot(residuals(re1))

Q12. (Bonus). Estimate the relationship between education levels (more than high school>12 years versus equal or less than high school<=12) and family income (Y). 1) How would you write the equation? 2) What’s the R output of the regression analysis? 3) How would you interpret the coefficient of education? 4) How would you interpret the intercept?

#First mutate 
eduin <- psid %>%   
  mutate (educ1= ifelse(educ<=8,
                        c("lhs"), c("mhs")))
#1. y=29.769 + 32.326x
#2. see output from lm below. 
r2 <- lm(adjfinc~educ1, data=eduin)
summary(r2)
## 
## Call:
## lm(formula = adjfinc ~ educ1, data = eduin)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -991.7  -36.0  -14.2   14.6 4982.7 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   29.769      1.001   29.75   <2e-16 ***
## educ1mhs      32.326      1.027   31.47   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 81.22 on 128843 degrees of freedom
##   (2516 observations deleted due to missingness)
## Multiple R-squared:  0.007627,   Adjusted R-squared:  0.007619 
## F-statistic: 990.3 on 1 and 128843 DF,  p-value: < 2.2e-16
#3. The coefficient of education tells us that a person with more than high school education, on average, will earn about $62,095 on average or $32,326 more than a person with less than or equal to high school education. 

#4.The intercept tells us that a person with less than or equal to a high school education, will earn, on average, $29,769.