library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the PRB 2013 CSV into `prb2013`, taking column names from the file.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
# NOTE(review): the path is absolute and machine-specific; the script only runs
# where this exact file location exists.
prb2013 <- read_csv(
  "C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1/PRB2013.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v tibble 3.0.3 v stringr 1.4.0
## v tidyr 1.1.1 v forcats 0.5.0
## v purrr 0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'nlme'
## The following object is masked from 'package:lme4':
##
## lmList
## The following object is masked from 'package:dplyr':
##
## collapse
##
## Attaching package: 'psych'
## The following object is masked from 'package:Hmisc':
##
## describe
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(readr)
library(dplyr)
# Read the PRB 2013 CSV into `prb`, the working copy used for the analyses
# below; column names come from the file. `TRUE` is spelled out because `T` is
# an ordinary variable that can be reassigned.
# NOTE(review): the path is absolute and machine-specific.
prb <- read_csv(
  "C:/Users/josha/Desktop/Fall 2020 Stats 1 Dem/WorkingDirectoryFall2020StatsDem1/PRB2013.csv",
  col_names = TRUE
)
## Parsed with column specification:
## cols(
## .default = col_double(),
## Country = col_character(),
## Continent = col_character(),
## Region = col_character()
## )
## See spec(...) for full column specifications.
# Lower-case every column name so later code can use `continent`, `tfr`, `imr`.
names(prb) <- tolower(names(prb))

# Add an `Africa` indicator column ("Africa" / "Not Africa").
# The column is referenced directly rather than as `prb$continent`: inside
# `mutate()` the bare name always refers to the rows flowing through the pipe,
# whereas `prb$continent` would silently misalign if the piped data were ever
# filtered, reordered, or grouped first.
prb_new <- prb %>%
  mutate(
    Africa = ifelse(continent == "Africa", yes = "Africa", no = "Not Africa")
  )
##The summary statistics indicate a difference in TFR between African and non-African countries: African countries have a higher average TFR (4.61) than their non-African counterparts (2.25).
# Mean and standard deviation of TFR (missing values ignored) and the number
# of rows in each `Africa` group. `n()` counts every row in the group,
# including rows whose `tfr` is missing.
prb_new %>%
  group_by(Africa) %>%
  summarise(
    means = mean(tfr, na.rm = TRUE),
    sds = sd(tfr, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 4
## Africa means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Not Africa 2.25 0.889 153
# Two-sided Welch t-test (unequal variances) comparing mean TFR between the
# two `Africa` groups. The defaults are written out explicitly.
t.test(
  tfr ~ Africa,
  data = prb_new,
  alternative = "two.sided",
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: tfr by Africa
## t = 11.559, df = 69.813, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 1.954226 2.769268
## sample estimates:
## mean in group Africa mean in group Not Africa
## 4.612727 2.250980
##Results from the t-test indicate that the difference in means highlighted above is statistically significant (t = 11.56, df = 69.81, p < 0.001), so we reject the null hypothesis, which stated there was no difference between the two groups. Additionally, the 95 percent confidence interval for the difference (1.95 to 2.77) does not contain 0, which lends more evidence to reject the null.
# Keep only Asian and African countries, with just the columns needed for the
# IMR comparison.
# `%in%` tests set membership row by row. The earlier form
# `continent == c("Asia", "Africa")` recycled the two-element vector down the
# rows (odd rows compared with "Asia", even rows with "Africa"), which silently
# dropped about half of the matching countries. Any output recorded from that
# version was computed on the partial subset and should be regenerated.
prb_new2 <- prb_new %>%
  filter(continent %in% c("Asia", "Africa")) %>%
  select(continent, imr)
# Per-continent mean and standard deviation of IMR (missing values ignored)
# and the number of rows per continent. Grouping by `continent` gives one row
# per group, matching the other summaries in this script; without it the
# summary pooled both continents into a single row.
data2 <- prb_new2 %>%
  group_by(continent) %>%
  summarise(
    means = mean(imr, na.rm = TRUE),
    sds = sd(imr, na.rm = TRUE),
    n = n()
  )

# Welch two-sample t-test of IMR between the continents present in `prb_new2`.
t.test(imr ~ continent, data = prb_new2)
##
## Welch Two Sample t-test
##
## data: imr by continent
## t = 4.3481, df = 52.373, p-value = 6.359e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 15.45146 41.92711
## sample estimates:
## mean in group Africa mean in group Asia
## 53.56071 24.87143
##In terms of IMR, Asian countries report an average rate of 24.87 deaths per 1,000 live births, while African countries are much higher, reporting on average 53.56 deaths per 1,000 live births, more than twice the rate of their Asian counterparts.
##The t-test shows that the difference in means is statistically significant: a t-value of 4.35 with 52.37 degrees of freedom (p < 0.001) indicates this. We can therefore reject the null hypothesis in favor of the alternative that the mean IMR of Asian and African countries differs.
# Boxplot of IMR for the Asia/Africa subset, with the x axis ordered
# Africa then Asia.
e <- ggplot(data = prb_new2, mapping = aes(x = continent, y = imr))
e +
  geom_boxplot() +
  scale_x_discrete(limits = c("Africa", "Asia"))

# Boxplot of IMR for every continent in the full data set.
ggplot(data = prb_new, mapping = aes(x = continent, y = imr)) +
  geom_boxplot()
## Warning: Removed 1 rows containing non-finite values (stat_boxplot).
# Mean and standard deviation of TFR (missing values ignored) and the number
# of rows for each continent. `n()` counts every row in the group, including
# rows whose `tfr` is missing.
prb %>%
  group_by(continent) %>%
  summarise(
    means = mean(tfr, na.rm = TRUE),
    sds = sd(tfr, na.rm = TRUE),
    n = n()
  )
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 6 x 4
## continent means sds n
## <chr> <dbl> <dbl> <int>
## 1 Africa 4.61 1.42 55
## 2 Asia 2.52 1.03 51
## 3 Europe 1.55 0.228 45
## 4 North America 2.21 0.546 27
## 5 Oceania 3.18 0.901 17
## 6 South America 2.5 0.476 13
##The output provided is the average TFR of countries grouped by continent. The 6 continental groups are Africa, Asia, Oceania, South America, North America, and Europe. Of those 6, Africa has the highest average TFR and Europe the lowest, with Asia and South America being the most similar. In addition, Africa and Asia have the largest and almost identical numbers of countries in their samples, while Oceania and South America have very few. In fact, Oceania and South America combined (30) barely have more countries than North America (27).
#3 Question 7 and 8
# One-way ANOVA of TFR across continents, followed by its ANOVA table.
m1 <- aov(tfr ~ continent, data = prb)
anova(object = m1)
## Analysis of Variance Table
##
## Response: tfr
## Df Sum Sq Mean Sq F value Pr(>F)
## continent 5 266.61 53.322 57.379 < 2.2e-16 ***
## Residuals 202 187.72 0.929
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 7-The F-value (57.38) indicates that the variation in mean TFR between continents is large relative to the variation within continents. As TFR is gathered from multiple countries across different continents, with varying views on fertility and contraceptive use in general, this would explain the high F-value.
##8-According to the F-test on TFR between continents, we can reject the null hypothesis, which states there is no difference in mean TFR across continents, in favor of the alternative, which states that at least one continent's mean TFR differs. The F-test alone does not tell us which continents differ.
library(haven)
# Read the PSID 2013 Stata file; the relative path is resolved against the
# current working directory.
psid2013 <- read_dta("psid2013.dta")

# `View()` opens a data viewer, which is only meaningful in an interactive
# session; skip it when the script is run or knitted non-interactively.
if (interactive()) {
  View(psid2013)
}
##People in the PSID data had a mean of 13.37 years of education and a mean family income of 54.36 thousand.
# Mean and standard deviation of `educ` (missing values ignored) and the total
# number of rows. `n()` counts all rows, including those with missing `educ`.
psid2013 %>%
  summarise(
    means = mean(educ, na.rm = TRUE),
    sds = sd(educ, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## means sds n
## <dbl> <dbl> <int>
## 1 13.4 3.19 23134
# Mean and standard deviation of `adjfinc` (missing values ignored) and the
# total number of rows. `n()` counts all rows, including those with missing
# `adjfinc`.
psid2013 %>%
  summarise(
    means = mean(adjfinc, na.rm = TRUE),
    sds = sd(adjfinc, na.rm = TRUE),
    n = n()
  )
## # A tibble: 1 x 3
## means sds n
## <dbl> <dbl> <int>
## 1 54.4 66.0 23134
# Simple linear regression of family income (`adjfinc`) on `educ`, followed by
# the coefficient table and fit statistics.
reg <- lm(formula = adjfinc ~ educ, data = psid2013)
summary(object = reg)
##
## Call:
## lm(formula = adjfinc ~ educ, data = psid2013)
##
## Residuals:
## Min 1Q Median 3Q Max
## -101.74 -28.94 -10.36 15.13 1751.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -41.1346 1.7674 -23.27 <2e-16 ***
## educ 7.1438 0.1285 55.59 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 62.07 on 22959 degrees of freedom
## (173 observations deleted due to missingness)
## Multiple R-squared: 0.1186, Adjusted R-squared: 0.1186
## F-statistic: 3091 on 1 and 22959 DF, p-value: < 2.2e-16
# ANOVA table for `reg`: sums of squares for `educ` and for the residuals.
anova(object = reg)
## Analysis of Variance Table
##
## Response: adjfinc
## Df Sum Sq Mean Sq F value Pr(>F)
## educ 1 11907473 11907473 3090.8 < 2.2e-16 ***
## Residuals 22959 88450801 3853
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Scatterplot of income against education with the fitted line from `reg`
# drawn in red.
with(psid2013, plot(x = educ, y = adjfinc))
abline(reg, col = "red")

# Boxplots of each variable on its own to inspect the distributions.
boxplot(psid2013[["educ"]])
boxplot(psid2013[["adjfinc"]])
##1)How would you write the linear regression equation? Predicted adjfinc = -41.1346 + 7.1438 * (education in years), where -41.1346 is the intercept and 7.1438 is the slope.
##2) Do you have any concerns that this model violates the regression assumptions?
##Yes. Education and income do not tend to be evenly distributed, and it feels like there are relevant terms we are leaving out, such as age, race, and the geography of the area. To test my assumptions I made boxplots, and the boxplots do indicate skewed distributions. We can assume that these variables were measured correctly, and, per the summary, the error terms appear to be consistent and uncorrelated. There is a lot to consider here.
#3) What’s the R output of the regression analysis?
##4) How would you interpret the coefficient of education?
##According to the regression coefficient of education, there is a positive association between years of education and income. For every additional year of education, predicted family income increases by 7.14 thousand dollars on average.
##5) Show the analysis of variance table from this regression analysis.
##6) What’s the value of SSE? What does it mean?
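##The value of our SSE is 88,450,801 (the Residuals sum of squares in the analysis of variance table). It is the sum of the squared differences between the observed incomes and the values predicted along our linear trend line, i.e., the variation in income that education does not explain. (The 11,907,473 in the educ row is the regression sum of squares, not the SSE.)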
##Bonus
# Keep only education and income, and add a two-level education group:
# "LHghSchlDeg" for educ <= 11, "MHghSchDeg" otherwise.
psid2013_new <- psid2013 %>%
  select(educ, adjfinc) %>%
  mutate(educc = ifelse(educ > 11, "MHghSchDeg", "LHghSchlDeg"))

# Regress family income on the education group and print the model summary.
reg2 <- lm(formula = adjfinc ~ educc, data = psid2013_new)
summary(object = reg2)
##
## Call:
## lm(formula = adjfinc ~ educc, data = psid2013_new)
##
## Residuals:
## Min 1Q Median 3Q Max
## -114.27 -31.20 -11.00 14.36 1792.31
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 27.3599 0.9896 27.65 <2e-16 ***
## educcMHghSchDeg 33.3149 1.0975 30.36 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.83 on 22959 degrees of freedom
## (173 observations deleted due to missingness)
## Multiple R-squared: 0.03859, Adjusted R-squared: 0.03854
## F-statistic: 921.5 on 1 and 22959 DF, p-value: < 2.2e-16
# ANOVA table for `reg2`: sums of squares for `educc` and for the residuals.
anova(object = reg2)
## Analysis of Variance Table
##
## Response: adjfinc
## Df Sum Sq Mean Sq F value Pr(>F)
## educc 1 3872470 3872470 921.46 < 2.2e-16 ***
## Residuals 22959 96485805 4203
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## How would you write the equation?
## What’s the R output of the regression analysis?
## How would you interpret the coefficient of education?
## How would you interpret the intercept?
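## Predicted adjfinc = 27.3599 + 33.3149 * (educc = MHghSchDeg), where the indicator is 1 for more than 11 years of education and 0 otherwise.
## Families in the MHghSchDeg group (more than 11 years of education) have, on average, 33.3149 thousand dollars more family income than families in the LHghSchlDeg group.
## The intercept is the average for the baseline (reference) group: families with 11 or fewer years of education (LHghSchlDeg) earn on average 27.3599 thousand dollars.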