library(readr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the PRB 2013 country-level file; the first row supplies column names.
# NOTE(review): the path is absolute and machine-specific, so this only runs
# on the original author's computer -- consider a project-relative path.
prb2013 <- read_csv(
  "C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1/PRB2013.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Continent = col_character(),
##   Region = col_character()
## )
## See spec(...) for full column specifications.
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.3     v stringr 1.4.0
## v tidyr   1.1.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
## 
##     lmList
## The following object is masked from 'package:dplyr':
## 
##     collapse
## 
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
## 
##     describe
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(readr)
library(dplyr)
# Read the PRB 2013 country-level file into `prb`; the first row supplies the
# column names.  TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
prb <- read_csv(
  "C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1/PRB2013.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   Country = col_character(),
##   Continent = col_character(),
##   Region = col_character()
## )
## See spec(...) for full column specifications.
# Lower-case every column name so later code can use `continent`, `tfr`,
# `imr`, ... regardless of how the CSV capitalised them.
names(prb) <- tolower(names(prb))

# Label each country "Africa" or "Not Africa".  The column is referenced bare
# (not as prb$continent) so mutate() reads it from the data being piped in
# rather than from the global `prb` object.
prb_new <- prb %>%
  mutate(
    Africa = ifelse(continent == "Africa", yes = "Africa", no = "Not Africa")
  )

Question 1. What can we conclude from the summary statistics?

##The summary statistics indicate a difference in TFR between African and non-African countries. African countries have a higher average TFR (4.61) than their non-African counterparts (2.25).
# Mean and standard deviation of tfr (missing values dropped) for the Africa
# and Not Africa groups.  n is the number of rows in each group, including any
# rows whose tfr is missing.
prb_new %>%
  group_by(Africa) %>%
  summarise(
    means = mean(tfr, na.rm = TRUE),
    sds = sd(tfr, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
##   Africa     means   sds     n
##   <chr>      <dbl> <dbl> <int>
## 1 Africa      4.61 1.42     55
## 2 Not Africa  2.25 0.889   153
# Two-sample t-test of mean tfr, Africa vs. Not Africa (t.test() defaults to
# the unequal-variance Welch form).
t.test(tfr ~ Africa, data = prb_new)
## 
##  Welch Two Sample t-test
## 
## data:  tfr by Africa
## t = 11.559, df = 69.813, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  1.954226 2.769268
## sample estimates:
##     mean in group Africa mean in group Not Africa 
##                 4.612727                 2.250980

Question 2

##Results from the t-test indicate that the difference in mean TFR is statistically significant (t = 11.56, df = 69.81, p < 0.001), so we reject the null hypothesis of no difference between the two groups. The 95% confidence interval for the difference (1.95 to 2.77) does not include 0, which is further evidence against the null.

Question 3. A + C

# Keep only the Asian and African countries, with just the columns needed for
# the IMR comparison.  `%in%` tests membership in the set; `==` against a
# length-2 vector recycles it down the rows and silently drops every country
# whose row position does not line up with the recycled value.
prb_new2 <- prb_new %>%
  filter(continent %in% c("Asia", "Africa")) %>%
  select(continent, imr)

# Mean, standard deviation (missing values dropped) and row count of imr for
# each of the two continents.
data2 <- prb_new2 %>%
  group_by(continent) %>%
  summarise(
    means = mean(imr, na.rm = TRUE),
    sds = sd(imr, na.rm = TRUE),
    n = n()
  )

# Two-sample t-test of mean imr, Africa vs. Asia.
# NOTE(review): results printed from the earlier `==` filter were computed on
# only part of the Asian and African countries; re-knit to refresh them.
t.test(imr ~ continent, data = prb_new2)
## 
##  Welch Two Sample t-test
## 
## data:  imr by continent
## t = 4.3481, df = 52.373, p-value = 6.359e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  15.45146 41.92711
## sample estimates:
## mean in group Africa   mean in group Asia 
##             53.56071             24.87143
##In terms of imr, Asian countries report an average rate of 24.87 deaths per 1,000 live births, while African countries are much higher, reporting on average 53.56 deaths per 1,000 live births, more than twice the rate of their Asian counterparts.

##The t-test shows the mean differences are statistically significant, we can reject the null hypothesis in favor of the alternate, there is a statistically significant difference in the mean imr of Asian and African countries. A t- value of 4.35 with 52.37 degrees of freedom indicates this.

Question 3. B

# Box plot of imr for the Asia/Africa subset, with Africa drawn first on the
# x axis.
e <- ggplot(data = prb_new2, mapping = aes(x = continent, y = imr))
e +
  geom_boxplot() +
  scale_x_discrete(limits = c("Africa", "Asia"))

# Box plot of imr for every continent in the full data set.
ggplot(prb_new, aes(x = continent, y = imr)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).

Question 6

# Mean and standard deviation of tfr (missing values dropped) and the number
# of rows for each continent.
prb %>%
  group_by(continent) %>%
  summarise(
    means = mean(tfr, na.rm = TRUE),
    sds = sd(tfr, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
##   continent     means   sds     n
##   <chr>         <dbl> <dbl> <int>
## 1 Africa         4.61 1.42     55
## 2 Asia           2.52 1.03     51
## 3 Europe         1.55 0.228    45
## 4 North America  2.21 0.546    27
## 5 Oceania        3.18 0.901    17
## 6 South America  2.5  0.476    13
##The output shows the average tfr of countries grouped by continent. The 6 continental groups are Africa, Asia, Oceania, South America, North America, and Europe. Of those 6, Africa has the highest average tfr and Europe the lowest, with Asia and South America being the most similar. In addition, Africa and Asia had the largest and nearly identical numbers of countries in their samples, while Oceania and South America had very few. In fact, Oceania and South America combined (30) barely have more countries than North America (27).

#3 Question 7 and 8

# One-way analysis of variance: does mean tfr differ across continents?
m1 <- aov(tfr ~ continent, data = prb)

# Print the ANOVA table (degrees of freedom, sums of squares, F test).
anova(m1)
## Analysis of Variance Table
## 
## Response: tfr
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## continent   5 266.61  53.322  57.379 < 2.2e-16 ***
## Residuals 202 187.72   0.929                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
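## 7-The F-value (57.38) is the ratio of the between-continent mean square (53.32) to the within-continent mean square (0.93); a value this large indicates that the variation in mean tfr between continents is much greater than the variation among countries within a continent. As tfr is gathered from multiple countries across different continents, with varying views on fertility and contraceptive use in general, this would explain the high F-value.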

##8-According to the F-Test on tfr between continents, we can reject the null hypothesis which states there is no difference in the mean tfr, in favor of the alternate which states there is a difference in mean tfr by continent. We just do not know where.

Question 9

library(haven)
# Read the PSID 2013 Stata file from the current working directory.
psid2013 <- read_dta("psid2013.dta")

# Open the data viewer only in an interactive session; View() is an
# interactive tool and should not run when the document is knit or the script
# is executed in batch mode.
if (interactive()) {
  View(psid2013)
}

##People in the PSID data had a mean value of 13.37 years of education with a family income 54.36 thousand.
# Mean and standard deviation of educ (missing values dropped); n is the total
# number of rows, including rows where educ is missing.
psid2013 %>%
  summarise(
    means = mean(educ, na.rm = TRUE),
    sds = sd(educ, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
##   means   sds     n
##   <dbl> <dbl> <int>
## 1  13.4  3.19 23134
# Mean and standard deviation of adjfinc (missing values dropped); n is the
# total number of rows, including rows where adjfinc is missing.
psid2013 %>%
  summarise(
    means = mean(adjfinc, na.rm = TRUE),
    sds = sd(adjfinc, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
##   means   sds     n
##   <dbl> <dbl> <int>
## 1  54.4  66.0 23134

Question 10

# Simple linear regression of family income (adjfinc) on education (educ).
reg <- lm(adjfinc ~ educ, data = psid2013)

# Coefficients, standard errors, R-squared and the overall F test.
summary(reg)
## 
## Call:
## lm(formula = adjfinc ~ educ, data = psid2013)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -101.74  -28.94  -10.36   15.13 1751.25 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -41.1346     1.7674  -23.27   <2e-16 ***
## educ          7.1438     0.1285   55.59   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 62.07 on 22959 degrees of freedom
##   (173 observations deleted due to missingness)
## Multiple R-squared:  0.1186, Adjusted R-squared:  0.1186 
## F-statistic:  3091 on 1 and 22959 DF,  p-value: < 2.2e-16
# ANOVA table for `reg`: sums of squares for educ and for the residuals.
anova(reg)
## Analysis of Variance Table
## 
## Response: adjfinc
##              Df   Sum Sq  Mean Sq F value    Pr(>F)    
## educ          1 11907473 11907473  3090.8 < 2.2e-16 ***
## Residuals 22959 88450801     3853                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatter plot of adjfinc against educ, with the fitted line from `reg`
# overlaid in red.
with(psid2013, plot(x = educ, y = adjfinc))
abline(reg, col = "red")

# Box plot of educ.
boxplot(x = psid2013$educ)

# Box plot of adjfinc.
boxplot(x = psid2013$adjfinc)

##1) How would you write the linear regression equation? Predicted adjfinc = -41.1346 + 7.1438 * (education in years)
##2)    Do you have any concerns that this model violates the regression assumptions? 
##Yes. Education and income are not evenly distributed, and relevant predictors such as age, race, and geography are left out of the model. To check my assumptions I made boxplots, which show skewed distributions. The residual summary points the same way: the median residual is -10.36 while the maximum is 1751.25, so the errors are strongly right-skewed rather than normal. We can assume that the variables were measured correctly, but the summary output alone cannot tell us whether the error terms have constant variance or are uncorrelated. There is a lot to consider here.
#3) What’s the R output of the regression analysis? 
##4)    How would you interpret the coefficient of education? 
##According to the regression coefficient of education, there is a positive association between years of education and family income. For every additional year of education, predicted family income increases by about 7.14 thousand dollars.
##5)    Show the analysis of variance table from this regression analysis.
##6)    What’s the value of SSE? What does it mean?
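##The SSE (residual sum of squares) is 88,450,801. It is the sum of the squared differences between the observed family incomes and the values predicted by the regression line, i.e. the variation in income that education does not explain. (The 11,907,473 in the educ row is the regression sum of squares, not the SSE.)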

##Bonus

# Keep educ and adjfinc and add a two-category education variable:
# "LHghSchlDeg" when educ is 11 or less, "MHghSchDeg" otherwise.
psid2013_new <- psid2013 %>%
  select(educ, adjfinc) %>%
  mutate(
    educc = ifelse(educ <= 11, yes = "LHghSchlDeg", no = "MHghSchDeg")
  )

# Regress adjfinc on the two-category education variable and print the fit.
reg2 <- lm(adjfinc ~ educc, data = psid2013_new)
summary(reg2)
## 
## Call:
## lm(formula = adjfinc ~ educc, data = psid2013_new)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -114.27  -31.20  -11.00   14.36 1792.31 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      27.3599     0.9896   27.65   <2e-16 ***
## educcMHghSchDeg  33.3149     1.0975   30.36   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 64.83 on 22959 degrees of freedom
##   (173 observations deleted due to missingness)
## Multiple R-squared:  0.03859,    Adjusted R-squared:  0.03854 
## F-statistic: 921.5 on 1 and 22959 DF,  p-value: < 2.2e-16
# ANOVA table for `reg2`: sums of squares for educc and for the residuals.
anova(reg2)
## Analysis of Variance Table
## 
## Response: adjfinc
##              Df   Sum Sq Mean Sq F value    Pr(>F)    
## educc         1  3872470 3872470  921.46 < 2.2e-16 ***
## Residuals 22959 96485805    4203                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##  How would you write the equation? 
##  What’s the R output of the regression analysis? 
##  How would you interpret the coefficient of education? 
##  How would you interpret the intercept? 


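## Predicted adjfinc = 27.3599 + 33.3149 * (1 if educc is "MHghSchDeg", i.e. more than 11 years of education; 0 otherwise)
## Families in the "MHghSchDeg" group have, on average, 33.31 thousand dollars more income than families in the "LHghSchlDeg" group.
## The intercept is the average income of the baseline group, "LHghSchlDeg" (11 or fewer years of education): 27.3599 thousand dollars.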